You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/01/31 17:13:17 UTC

svn commit: r373853 [4/6] - in /lucene/nutch/trunk/src: java/org/apache/nutch/analysis/ java/org/apache/nutch/clustering/ java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/ java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apac...

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java Tue Jan 31 08:08:58 2006
@@ -25,12 +25,15 @@
 
 import org.apache.nutch.searcher.Query.Clause;
 import org.apache.nutch.searcher.Query.Phrase;
+import org.apache.nutch.util.NutchConf;
 
 /** Translate query fields to search the same-named field, as indexed by an
  * IndexingFilter.  Best for tokenized fields. */
 public abstract class FieldQueryFilter implements QueryFilter {
   private String field;
   private float boost = 1.0f;
+  private NutchConf nutchConf;
+  private CommonGrams commonGrams;
 
   /** Construct for the named field.*/
   protected FieldQueryFilter(String field) {
@@ -57,12 +60,12 @@
 
       // optimize phrase clause
       if (c.isPhrase()) {
-        String[] opt = CommonGrams.optimizePhrase(c.getPhrase(), field);
+        String[] opt = this.commonGrams.optimizePhrase(c.getPhrase(), field);
         if (opt.length==1) {
           c = new Clause(new Query.Term(opt[0]),
-                         c.isRequired(), c.isProhibited());
+                         c.isRequired(), c.isProhibited(), getConf());
         } else {
-          c = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited());
+          c = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited(), getConf());
         }
       }
 
@@ -88,5 +91,14 @@
     
     // return the modified Lucene query
     return output;
+  }
+  
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.commonGrams = new CommonGrams(conf);
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Tue Jan 31 08:08:58 2006
@@ -47,38 +47,43 @@
 
   private org.apache.lucene.search.Searcher luceneSearcher;
   private org.apache.lucene.index.IndexReader reader;
-
-  private LuceneQueryOptimizer optimizer = new LuceneQueryOptimizer
-    (NutchConf.get().getInt("searcher.filter.cache.size", 16),
-     NutchConf.get().getFloat("searcher.filter.cache.threshold", 0.05f));
+  private LuceneQueryOptimizer optimizer;
+  private NutchFileSystem fs;
+  private NutchConf nutchConf;
+  private QueryFilters queryFilters;
 
   /** Construct given a number of indexes. */
-  public IndexSearcher(File[] indexDirs) throws IOException {
+  public IndexSearcher(File[] indexDirs, NutchConf nutchConf) throws IOException {
     IndexReader[] readers = new IndexReader[indexDirs.length];
+    this.nutchConf = nutchConf;
+    this.fs = NutchFileSystem.get(nutchConf);
     for (int i = 0; i < indexDirs.length; i++) {
       readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
     }
-    init(new MultiReader(readers));
+    init(new MultiReader(readers), nutchConf);
   }
 
   /** Construct given a single merged index. */
-  public IndexSearcher(File index)
+  public IndexSearcher(File index,  NutchConf nutchConf)
     throws IOException {
-    init(IndexReader.open(getDirectory(index)));
+    this.nutchConf = nutchConf;
+    this.fs = NutchFileSystem.get(nutchConf);
+    init(IndexReader.open(getDirectory(index)), nutchConf);
   }
 
-  private void init(IndexReader reader) throws IOException {
+  private void init(IndexReader reader, NutchConf nutchConf) throws IOException {
     this.reader = reader;
     this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
     this.luceneSearcher.setSimilarity(new NutchSimilarity());
+    this.optimizer = new LuceneQueryOptimizer(nutchConf);
+    this.queryFilters = new QueryFilters(nutchConf);
   }
 
   private Directory getDirectory(File file) throws IOException {
-    NutchFileSystem fs = NutchFileSystem.get();
-    if ("local".equals(fs.getName())) {
+    if ("local".equals(this.fs.getName())) {
       return FSDirectory.getDirectory(file, false);
     } else {
-      return new NdfsDirectory(fs, file, false);
+      return new NdfsDirectory(this.fs, file, false, this.nutchConf);
     }
   }
 
@@ -86,10 +91,8 @@
                      String dedupField, String sortField, boolean reverse)
 
     throws IOException {
-
     org.apache.lucene.search.BooleanQuery luceneQuery =
-      QueryFilters.filter(query);
-    
+      this.queryFilters.filter(query);
     return translateHits
       (optimizer.optimize(luceneQuery, luceneSearcher, numHits,
                           sortField, reverse),
@@ -97,7 +100,7 @@
   }
 
   public String getExplanation(Query query, Hit hit) throws IOException {
-    return luceneSearcher.explain(QueryFilters.filter(query),
+    return luceneSearcher.explain(this.queryFilters.filter(query),
                                   hit.getIndexDocNo()).toHtml();
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java Tue Jan 31 08:08:58 2006
@@ -11,6 +11,7 @@
 import org.apache.nutch.crawl.LinkDbReader;
 import org.apache.nutch.fs.NutchFileSystem;
 import org.apache.nutch.io.UTF8;
+import org.apache.nutch.util.NutchConf;
 
 import java.io.File;
 
@@ -18,8 +19,8 @@
   
   private LinkDbReader linkdb = null;
   
-  public LinkDbInlinks(NutchFileSystem fs, File dir) {
-    linkdb = new LinkDbReader(fs, dir);
+  public LinkDbInlinks(NutchFileSystem fs, File dir, NutchConf nutchConf) {
+    linkdb = new LinkDbReader(fs, dir, nutchConf);
   }
 
   public String[] getAnchors(HitDetails details) throws IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Tue Jan 31 08:08:58 2006
@@ -37,8 +37,6 @@
  * which do not affect ranking but might otherwise slow search considerably. */
 class LuceneQueryOptimizer {
 
-  private static int MAX_HITS = NutchConf.get().getInt("searcher.max.hits",-1);
-
   private static class LimitExceeded extends RuntimeException {
     private int maxDoc;
     public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }    
@@ -63,18 +61,28 @@
 
   private float threshold;
 
-  /** Construct an optimizer that caches and uses filters for required clauses
+  private int searcherMaxHits;
+
+  /**
+   * Construct an optimizer that caches and uses filters for required clauses
    * whose boost is zero.
-   * @param cacheSize the number of QueryFilters to cache
-   * @param threshold the fraction of documents which must contain a term
+   * 
+   * @param nutchConf
+   *          the configuration supplying searcher.filter.cache.size (default
+   *          16), searcher.filter.cache.threshold (default 0.05) and
+   *          searcher.max.hits (default -1, meaning no hit limit)
    */
-  public LuceneQueryOptimizer(final int cacheSize, float threshold) {
+  public LuceneQueryOptimizer(NutchConf nutchConf) {
+    final int cacheSize = nutchConf.getInt("searcher.filter.cache.size", 16);
+    this.threshold = nutchConf.getFloat("searcher.filter.cache.threshold",
+        0.05f);
+    this.searcherMaxHits = nutchConf.getInt("searcher.max.hits", -1);
+    this.searcherMaxHits = searcherMaxHits;
     this.cache = new LinkedHashMap(cacheSize, 0.75f, true) {
-        protected boolean removeEldestEntry(Map.Entry eldest) {
-          return size() > cacheSize;              // limit size of cache
-        }
-      };
-    this.threshold = threshold;
+      protected boolean removeEldestEntry(Map.Entry eldest) {
+        return size() > cacheSize; // limit size of cache
+      }
+    };
   }
 
   public TopDocs optimize(BooleanQuery original,
@@ -123,7 +131,6 @@
     }
 
     Filter filter = null;
-
     if (cacheQuery.getClauses().length != 0) {
       synchronized (cache) {                      // check cache
         filter = (Filter)cache.get(cacheQuery);
@@ -151,12 +158,12 @@
     if (sortField == null && !reverse) {
 
       // no hit limit
-      if (MAX_HITS <= 0) {
+      if (this.searcherMaxHits <= 0) {
         return searcher.search(query, filter, numHits);
       }
 
       // hits limited -- use a LimitedCollector
-      LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS);
+      LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits);
       LimitExceeded exceeded = null;
       try {
         searcher.search(query, filter, collector);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Tue Jan 31 08:08:58 2006
@@ -42,8 +42,6 @@
     LogFormatter.setShowThreadIDs(true);
   }
 
-  private NutchFileSystem fs = NutchFileSystem.get();
-
   private String[] segmentNames;
 
   private Searcher searcher;
@@ -52,50 +50,65 @@
   private HitContent content;
   private HitInlinks linkDb;
 
-  private float RAW_HITS_FACTOR =
-    NutchConf.get().getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
 
   /** BooleanQuery won't permit more than 32 required/prohibited clauses.  We
    * don't want to use too many of those. */ 
   private static final int MAX_PROHIBITED_TERMS = 20;
+  
+  private NutchConf nutchConf;
+
+  private NutchFileSystem fs;
 
   /** Cache in servlet context. */
-  public static NutchBean get(ServletContext app) throws IOException {
+  public static NutchBean get(ServletContext app, NutchConf conf) throws IOException {
     NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
     if (bean == null) {
       LOG.info("creating new bean");
-      bean = new NutchBean();
+      bean = new NutchBean(conf);
       app.setAttribute("nutchBean", bean);
     }
     return bean;
   }
 
-  /** Construct reading from connected directory. */
-  public NutchBean() throws IOException {
-    this(new File(NutchConf.get().get("searcher.dir", "crawl")));
-  }
 
-  /** Construct in a named directory. */
-  public NutchBean(File dir) throws IOException {
-    File servers = new File(dir, "search-servers.txt");
-    if (fs.exists(servers)) {
-      LOG.info("searching servers in " + servers.getCanonicalPath());
-      init(new DistributedSearch.Client(servers));
-    } else {
-      init(new File(dir, "index"),
-           new File(dir, "indexes"),
-           new File(dir, "segments"),
-           new File(dir, "linkdb"));
-    }
+  /**
+   * Construct in the directory named by "searcher.dir" (default "crawl").
+   * @param nutchConf
+   * @throws IOException
+   */
+  public NutchBean(NutchConf nutchConf) throws IOException {
+    this(nutchConf, null);
   }
+  
+  /**
+   * Construct in a named directory; a null dir falls back to "searcher.dir".
+   * @param nutchConf
+   * @param dir
+   * @throws IOException
+   */
+  public NutchBean(NutchConf nutchConf, File dir) throws IOException {
+        this.nutchConf = nutchConf;
+        this.fs = NutchFileSystem.get(this.nutchConf);
+        if (dir == null) {
+            dir = new File(this.nutchConf.get("searcher.dir", "crawl"));
+        }
+        File servers = new File(dir, "search-servers.txt");
+        if (fs.exists(servers)) {
+            LOG.info("searching servers in " + servers.getCanonicalPath());
+            init(new DistributedSearch.Client(servers, nutchConf));
+        } else {
+            init(new File(dir, "index"), new File(dir, "indexes"), new File(
+                    dir, "segments"), new File(dir, "linkdb"));
+        }
+    }
 
   private void init(File indexDir, File indexesDir, File segmentsDir,
                     File linkDb)
     throws IOException {
     IndexSearcher indexSearcher;
-    if (fs.exists(indexDir)) {
+    if (this.fs.exists(indexDir)) {
       LOG.info("opening merged index in " + indexDir);
-      indexSearcher = new IndexSearcher(indexDir);
+      indexSearcher = new IndexSearcher(indexDir, this.nutchConf);
     } else {
       LOG.info("opening indexes in " + indexesDir);
       
@@ -108,16 +121,17 @@
         }
       }
       
+      
       directories = new File[ vDirs.size() ];
       for(int i = 0; vDirs.size()>0; i++) {
         directories[i]=(File)vDirs.remove(0);
       }
       
-      indexSearcher = new IndexSearcher(directories);
+      indexSearcher = new IndexSearcher(directories, this.nutchConf);
     }
 
     LOG.info("opening segments in " + segmentsDir);
-    FetchedSegments segments = new FetchedSegments(fs, segmentsDir.toString());
+    FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.nutchConf);
     
     this.segmentNames = segments.getSegmentNames();
 
@@ -127,7 +141,7 @@
     this.content = segments;
 
     LOG.info("opening linkdb in " + linkDb);
-    this.linkDb = new LinkDbInlinks(fs, linkDb);
+    this.linkDb = new LinkDbInlinks(fs, linkDb, this.nutchConf);
   }
 
   private void init(DistributedSearch.Client client) {
@@ -216,7 +230,8 @@
     if (maxHitsPerDup <= 0)                      // disable dup checking
       return search(query, numHits, dedupField, sortField, reverse);
 
-    int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR);
+    float rawHitsFactor = this.nutchConf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
+    int numHitsRaw = (int)(numHits * rawHitsFactor);
     LOG.info("searching for "+numHitsRaw+" raw hits");
     Hits hits = searcher.search(query, numHitsRaw,
                                 dedupField, sortField, reverse);
@@ -237,7 +252,7 @@
           optQuery.addProhibitedTerm(((String)excludedValues.get(i)),
                                      dedupField);
         }
-        numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR);
+        numHitsRaw = (int)(numHitsRaw * rawHitsFactor);
         LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
         hits = searcher.search(optQuery, numHitsRaw,
                                dedupField, sortField, reverse);
@@ -344,9 +359,9 @@
       System.exit(-1);
     }
 
-    NutchBean bean = new NutchBean();
-    Query query = Query.parse(args[0]);
-
+    NutchConf nutchConf = new NutchConf();
+    NutchBean bean = new NutchBean(nutchConf);
+    Query query = Query.parse(args[0], nutchConf);
     Hits hits = bean.search(query, 10);
     System.out.println("Total hits: " + hits.getTotal());
     int length = (int)Math.min(hits.getTotal(), 10);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Tue Jan 31 08:08:58 2006
@@ -31,6 +31,8 @@
 import javax.servlet.http.HttpServletResponse;
 
 import javax.xml.parsers.*;
+
+import org.apache.nutch.util.NutchConf;
 import org.w3c.dom.*;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.Transformer;
@@ -55,10 +57,12 @@
   }
 
   private NutchBean bean;
+  private NutchConf nutchConf;
 
-  public void init(ServletConfig config) throws ServletException {
+  public void init(ServletConfig config, NutchConf nutchConf) throws ServletException {
     try {
-      bean = NutchBean.get(config.getServletContext());
+      bean = NutchBean.get(config.getServletContext(), nutchConf);
+      this.nutchConf = nutchConf;
     } catch (IOException e) {
       throw new ServletException(e);
     }
@@ -114,7 +118,7 @@
         (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") +
         (dedupField == null ? "" : "&dedupField=" + dedupField));
 
-    Query query = Query.parse(queryString);
+    Query query = Query.parse(queryString, this.nutchConf);
     NutchBean.LOG.info("query: " + queryString);
 
     // execute the query

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
 import java.util.logging.Logger;
 
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.analysis.NutchAnalysis;
 
 import org.apache.nutch.io.Writable;
@@ -47,30 +48,34 @@
     private boolean isProhibited;
     private String field = DEFAULT_FIELD;
     private float weight = 1.0f;
-    private Object termOrPhrase; 
+    private Object termOrPhrase;
+
+    private NutchConf nutchConf; 
 
     public Clause(Term term, String field,
-                  boolean isRequired, boolean isProhibited) {
-      this(term, isRequired, isProhibited);
+                  boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
+      this(term, isRequired, isProhibited, nutchConf);
       this.field = field;
     }
 
-    public Clause(Term term, boolean isRequired, boolean isProhibited) {
+    public Clause(Term term, boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
       this.isRequired = isRequired;
       this.isProhibited = isProhibited;
       this.termOrPhrase = term;
+      this.nutchConf = nutchConf;
     }
 
     public Clause(Phrase phrase, String field,
-                  boolean isRequired, boolean isProhibited) {
-      this(phrase, isRequired, isProhibited);
+                  boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
+      this(phrase, isRequired, isProhibited, nutchConf);
       this.field = field;
     }
 
-    public Clause(Phrase phrase, boolean isRequired, boolean isProhibited) {
+    public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
       this.isRequired = isRequired;
       this.isProhibited = isProhibited;
       this.termOrPhrase = phrase;
+      this.nutchConf = nutchConf;
     }
 
     public boolean isRequired() { return isRequired; }
@@ -104,7 +109,7 @@
         getTerm().write(out);
     }
 
-    public static Clause read(DataInput in) throws IOException {
+    public static Clause read(DataInput in, NutchConf nutchConf) throws IOException {
       byte bits = in.readByte();
       boolean required = ((bits & REQUIRED_BIT) != 0);
       boolean prohibited = ((bits & PROHIBITED_BIT) != 0);
@@ -114,9 +119,9 @@
 
       Clause clause;
       if ((bits & PHRASE_BIT) == 0) {
-        clause = new Clause(Term.read(in), field, required, prohibited);
+        clause = new Clause(Term.read(in), field, required, prohibited, nutchConf);
       } else {
-        clause = new Clause(Phrase.read(in), field, required, prohibited);
+        clause = new Clause(Phrase.read(in), field, required, prohibited, nutchConf);
       }
       clause.weight = weight;
       return clause;
@@ -135,7 +140,7 @@
         buffer.append(":");
       }
 
-      if (!isPhrase() && QueryFilters.isRawField(field)) {
+      if (!isPhrase() && new QueryFilters(nutchConf).isRawField(field)) {
         buffer.append('"');                        // quote raw terms
         buffer.append(termOrPhrase.toString());
         buffer.append('"');
@@ -274,7 +279,13 @@
 
   private ArrayList clauses = new ArrayList();
 
+  private NutchConf nutchConf;
+
   private static final Clause[] CLAUSES_PROTO = new Clause[0];
+  
+  public Query(NutchConf nutchConf) {
+      this.nutchConf = nutchConf;
+  }
 
   /** Return all clauses. */
   public Clause[] getClauses() {
@@ -288,7 +299,7 @@
 
   /** Add a required term in a specified field. */
   public void addRequiredTerm(String term, String field) {
-    clauses.add(new Clause(new Term(term), field, true, false));
+    clauses.add(new Clause(new Term(term), field, true, false, this.nutchConf));
   }
 
   /** Add a prohibited term in the default field. */
@@ -298,7 +309,7 @@
 
   /** Add a prohibited term in the specified field. */
   public void addProhibitedTerm(String term, String field) {
-    clauses.add(new Clause(new Term(term), field, false, true));
+    clauses.add(new Clause(new Term(term), field, false, true, this.nutchConf));
   }
 
   /** Add a required phrase in the default field. */
@@ -312,7 +323,7 @@
     } else if (terms.length == 1) {
       addRequiredTerm(terms[0], field);           // optimize to term query
     } else {
-      clauses.add(new Clause(new Phrase(terms), field, true, false));
+      clauses.add(new Clause(new Phrase(terms), field, true, false, this.nutchConf));
     }
   }
 
@@ -327,7 +338,7 @@
     } else if (terms.length == 1) {
       addProhibitedTerm(terms[0], field);         // optimize to term query
     } else {
-      clauses.add(new Clause(new Phrase(terms), field, false, true));
+      clauses.add(new Clause(new Phrase(terms), field, false, true, this.nutchConf));
     }
   }
 
@@ -337,8 +348,8 @@
       ((Clause)clauses.get(i)).write(out);
   }
   
-  public static Query read(DataInput in) throws IOException {
-    Query result = new Query();
+  public static Query read(DataInput in, NutchConf nutchConf) throws IOException {
+    Query result = new Query(nutchConf);
     result.readFields(in);
     return result;
   }
@@ -347,7 +358,7 @@
     clauses.clear();
     int length = in.readByte();
     for (int i = 0; i < length; i++)
-      clauses.add(Clause.read(in));
+      clauses.add(Clause.read(in, this.nutchConf));
   }
 
   public String toString() {
@@ -404,18 +415,18 @@
 
 
   /** Parse a query from a string. */
-  public static Query parse(String queryString) throws IOException {
-    return fixup(NutchAnalysis.parseQuery(queryString));
+  public static Query parse(String queryString, NutchConf nutchConf) throws IOException {
+    return fixup(NutchAnalysis.parseQuery(queryString, nutchConf), nutchConf);
   }
 
   /** Convert clauses in unknown fields to the default field. */
-  private static Query fixup(Query input) {
+  private static Query fixup(Query input, NutchConf nutchConf) {
     // walk the query
-    Query output = new Query();
+    Query output = new Query(nutchConf);
     Clause[] clauses = input.getClauses();
     for (int i = 0; i < clauses.length; i++) {
       Clause c = clauses[i];
-      if (!QueryFilters.isField(c.getField())) {  // unknown field
+      if (!new QueryFilters(nutchConf).isField(c.getField())) {  // unknown field
         ArrayList terms = new ArrayList();        // add name to query
         if (c.isPhrase()) {                       
           terms.addAll(Arrays.asList(c.getPhrase().getTerms()));
@@ -436,12 +447,13 @@
   /** For debugging. */
   public static void main(String[] args) throws Exception {
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    NutchConf nutchConf = new NutchConf();
     while (true) {
       System.out.print("Query: ");
       String line = in.readLine();
-      Query query = parse(line);
+      Query query = parse(line, nutchConf);
       System.out.println("Parsed: " + query);
-      System.out.println("Translated: " + QueryFilters.filter(query));
+      System.out.println("Translated: " + new QueryFilters(nutchConf).filter(query));
     }
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java Tue Jan 31 08:08:58 2006
@@ -17,12 +17,13 @@
 package org.apache.nutch.searcher;
 
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.nutch.util.NutchConfigurable;
 
 /** Extension point for query translation.  Permits one to add metadata to a
  * query.  All plugins found which implement this extension point are run
  * sequentially on the query.
  */
-public interface QueryFilter {
+public interface QueryFilter extends NutchConfigurable {
   /** The name of the extension point. */
   final static String X_POINT_ID = QueryFilter.class.getName();
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,8 @@
 import org.apache.nutch.plugin.*;
 import org.apache.nutch.searcher.Query.Clause;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
 import java.util.logging.Logger;
 import java.util.*;
 
@@ -35,35 +37,9 @@
   private static final Logger LOG =
     LogFormatter.getLogger("org.apache.nutch.searcher.QueryFilters");
 
-  private static final QueryFilter[] CACHE;
-  private static final HashSet FIELD_NAMES = new HashSet();
-  private static final HashSet RAW_FIELD_NAMES = new HashSet();
-
-  static {
-    try {
-      ExtensionPoint point = PluginRepository.getInstance()
-        .getExtensionPoint(QueryFilter.X_POINT_ID);
-      if (point == null)
-        throw new RuntimeException(QueryFilter.X_POINT_ID+" not found.");
-      Extension[] extensions = point.getExtensions();
-      CACHE = new QueryFilter[extensions.length];
-      for (int i = 0; i < extensions.length; i++) {
-        Extension extension = extensions[i];
-        ArrayList fieldNames = parseFieldNames(extension, "fields");
-        ArrayList rawFieldNames = parseFieldNames(extension, "raw-fields");
-        if (fieldNames.size() == 0 && rawFieldNames.size() == 0) {
-          LOG.warning("QueryFilter: "+extension.getId()+" names no fields.");
-          continue;
-        }
-        CACHE[i] = (QueryFilter)extension.getExtensionInstance();
-        FIELD_NAMES.addAll(fieldNames);
-        FIELD_NAMES.addAll(rawFieldNames);
-        RAW_FIELD_NAMES.addAll(rawFieldNames);
-      }
-    } catch (PluginRuntimeException e) {
-      throw new RuntimeException(e);
-    }
-  }
+  private QueryFilter[] queryFilters;
+  private HashSet FIELD_NAMES ;
+  private HashSet RAW_FIELD_NAMES;
 
   private static ArrayList parseFieldNames(Extension extension,
                                            String attribute) {
@@ -72,10 +48,50 @@
     return Collections.list(new StringTokenizer(fields, " ,\t\n\r"));
   }
 
-  private  QueryFilters() {}                  // no public ctor
+  public QueryFilters(NutchConf nutchConf) {
+    this.queryFilters = (QueryFilter[]) nutchConf.getObject(QueryFilter.class
+        .getName());
+    if (this.queryFilters == null) {
+      try {
+        ExtensionPoint point = nutchConf.getPluginRepository()
+            .getExtensionPoint(QueryFilter.X_POINT_ID);
+        if (point == null)
+          throw new RuntimeException(QueryFilter.X_POINT_ID + " not found.");
+        Extension[] extensions = point.getExtensions();
+        QueryFilter[] filters = new QueryFilter[extensions.length];
+        for (int i = 0; i < extensions.length; i++) {
+          Extension extension = extensions[i];
+          ArrayList fieldNames = parseFieldNames(extension, "fields");
+          ArrayList rawFieldNames = parseFieldNames(extension, "raw-fields");
+          if (fieldNames.size() == 0 && rawFieldNames.size() == 0) {
+            LOG.warning("QueryFilter: " + extension.getId()
+                + " names no fields.");
+            continue;
+          }
+          filters[i] = (QueryFilter) extension.getExtensionInstance();
+          FIELD_NAMES = new HashSet();
+          FIELD_NAMES.addAll(fieldNames);
+          FIELD_NAMES.addAll(rawFieldNames);
+          nutchConf.setObject("FIELD_NAMES", FIELD_NAMES);
+          RAW_FIELD_NAMES = new HashSet();
+          RAW_FIELD_NAMES.addAll(rawFieldNames);
+          nutchConf.setObject("RAW_FIELD_NAMES", RAW_FIELD_NAMES);
+        }
+        nutchConf.setObject(QueryFilter.class.getName(), filters);
+      } catch (PluginRuntimeException e) {
+        throw new RuntimeException(e);
+      }
+      this.queryFilters = (QueryFilter[]) nutchConf.getObject(QueryFilter.class
+          .getName());
+    } else {
+      // cache already filled
+      FIELD_NAMES = (HashSet) nutchConf.getObject("FIELD_NAMES");
+      RAW_FIELD_NAMES = (HashSet) nutchConf.getObject("RAW_FIELD_NAMES");
+    }
+  }              
 
   /** Run all defined filters. */
-  public static BooleanQuery filter(Query input) throws QueryException {
+  public BooleanQuery filter(Query input) throws QueryException {
     // first check that all field names are claimed by some plugin
     Clause[] clauses = input.getClauses();
     for (int i = 0; i < clauses.length; i++) {
@@ -86,16 +102,17 @@
 
     // then run each plugin
     BooleanQuery output = new BooleanQuery();
-    for (int i = 0 ; i < CACHE.length; i++) {
-      output = CACHE[i].filter(input, output);
+    for (int i = 0; i < this.queryFilters.length; i++) {
+      output = this.queryFilters[i].filter(input, output);
     }
     return output;
   }
 
-  public static boolean isField(String name) {
+  public boolean isField(String name) {
     return FIELD_NAMES.contains(name);
   }
-  public static boolean isRawField(String name) {
+  
+  public boolean isRawField(String name) {
     return RAW_FIELD_NAMES.contains(name);
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java Tue Jan 31 08:08:58 2006
@@ -29,21 +29,23 @@
 
 /** Implements hit summarization. */
 public class Summarizer {
-
-  /** The number of context terms to display preceding and following matches.*/
-  private static final int SUM_CONTEXT =
-    NutchConf.get().getInt("searcher.summary.context", 5);
-
-  /** The total number of terms to display in a summary.*/
-  private static final int SUM_LENGTH = 
-    NutchConf.get().getInt("searcher.summary.length", 20);
-
+   
   /** Converts text to tokens. */
-  private static final Analyzer ANALYZER = new NutchDocumentAnalyzer();
+  private Analyzer ANALYZER;
+  private NutchConf nutchConf;
 
   /**
-   * Class Excerpt represents a single passage found in the
-   * document, with some appropriate regions highlit.
+   * The constructor.
+   * @param conf
+   */
+  public Summarizer(NutchConf conf) {
+    this.nutchConf = conf;
+    this.ANALYZER = new NutchDocumentAnalyzer(conf);
+  }
+
+  /**
+   * Class Excerpt represents a single passage found in the document, with some
+   * appropriate regions highlit.
    */
   class Excerpt {
       Vector passages = new Vector();
@@ -54,7 +56,7 @@
        */
       public Excerpt() {
       }
-
+      
       /**
        */
       public void addToken(String token) {
@@ -99,7 +101,7 @@
   }
 
   /** Returns a summary for the given pre-tokenized text. */
-  public Summary getSummary(String text, Query query) throws IOException {
+  public Summary getSummary(String text, Query query, int sumContext, int sumLength) throws IOException {
 
     // Simplistic implementation.  Finds the first fragments in the document
     // containing any query terms.
@@ -161,8 +163,8 @@
         // Start searching at a point SUM_CONTEXT terms back,
         // and move SUM_CONTEXT terms into the future.
         //
-        int startToken = (i > SUM_CONTEXT) ? i-SUM_CONTEXT : 0;
-        int endToken = Math.min(i+SUM_CONTEXT, tokens.length);
+        int startToken = (i > sumContext) ? i - sumContext : 0;
+        int endToken = Math.min(i + sumContext, tokens.length);
         int offset = tokens[startToken].startOffset();
         int j = startToken;
 
@@ -181,7 +183,7 @@
         // the document and we haven't hit the max-number-of-items
         // -in-a-summary.
         //
-        while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
+        while ((j < endToken) && (j - startToken < sumLength)) {
           //
           // Now grab the hit-element, if present
           //
@@ -191,7 +193,7 @@
             excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
             excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
             offset = t.endOffset();
-            endToken = Math.min(j+SUM_CONTEXT, tokens.length);
+            endToken = Math.min(j + sumContext, tokens.length);
           }
 
           j++;
@@ -226,7 +228,7 @@
         // Start SUM_CONTEXT places away.  The next
         // search for relevant excerpts begins at i-SUM_CONTEXT
         //
-        i = j+SUM_CONTEXT;
+        i = j + sumContext;
       }
     }
 
@@ -236,7 +238,7 @@
     //
     if (excerptSet.size() == 0) {
         Excerpt excerpt = new Excerpt();
-        int excerptLen = Math.min(SUM_LENGTH, tokens.length);
+        int excerptLen = Math.min(sumLength, tokens.length);
         lastExcerptPos = excerptLen;
 
         excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
@@ -250,7 +252,7 @@
     //
     double tokenCount = 0;
     Summary s = new Summary();
-    while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
+    while (tokenCount <= sumLength && excerptSet.size() > 0) {
         Excerpt excerpt = (Excerpt) excerptSet.last();
         excerptSet.remove(excerpt);
 
@@ -258,7 +260,7 @@
         for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
             Fragment f = (Fragment) e.nextElement();
             // Don't add fragments if it takes us over the max-limit
-            if (tokenCount + tokenFraction <= SUM_LENGTH) {
+            if (tokenCount + tokenFraction <= sumLength) {
                 s.add(f);
             }
             tokenCount += tokenFraction;
@@ -290,7 +292,7 @@
             return;
         }
 
-        Summarizer s = new Summarizer();
+        Summarizer s = new Summarizer(new NutchConf());
 
         //
         // Parse the args
@@ -318,8 +320,11 @@
             in.close();
         }
 
+        NutchConf nutchConf = new NutchConf();
+        int sumContext = nutchConf.getInt("searcher.summary.context", 5);
+        int sumLength = nutchConf.getInt("searcher.summary.length", 20);
         // Convert the query string into a proper Query
-        Query query = Query.parse(queryBuf.toString());
-        System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
+        Query query = Query.parse(queryBuf.toString(), nutchConf);
+        System.out.println("Summary: '" + s.getSummary(body.toString(), query, sumContext, sumLength) + "'");
     }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Tue Jan 31 08:08:58 2006
@@ -68,7 +68,7 @@
       throws IOException {
       reporter.setStatus(split.toString());
 
-      return new SequenceFileRecordReader(fs, split) {
+      return new SequenceFileRecordReader(job, split) {
           public synchronized boolean next(Writable key, Writable value)
             throws IOException {
             ObjectWritable wrapper = (ObjectWritable)value;
@@ -219,7 +219,8 @@
   }
   
   public static void main(String[] args) throws Exception {
-    SegmentReader segmentReader = new SegmentReader(NutchConf.get());
+    NutchConf nutchConf = new NutchConf();
+    SegmentReader segmentReader = new SegmentReader(nutchConf);
 
     String usage = "Usage: SegmentReader <segment>";
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Tue Jan 31 08:08:58 2006
@@ -47,9 +47,9 @@
 
   NutchBean bean = null;
 
-  public void init() {
+  public void init(NutchConf nutchConf) {
     try {
-      bean = NutchBean.get(this.getServletContext());
+      bean = NutchBean.get(this.getServletContext(), nutchConf);
     } catch (IOException e) {
       // nothing
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Tue Jan 31 08:08:58 2006
@@ -338,7 +338,8 @@
     Pattern topicPattern = null; 
     Vector topics = new Vector(); 
     
-    NutchFileSystem nfs = NutchFileSystem.get();
+    NutchConf nutchConf = new NutchConf();
+    NutchFileSystem nfs = NutchFileSystem.get(nutchConf);
     try {
       for (int i = 1; i < argv.length; i++) {
         if ("-includeAdultMaterial".equals(argv[i])) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java Tue Jan 31 08:08:58 2006
@@ -459,8 +459,9 @@
     if (qPath != null) {
       is = new FileInputStream(qPath);
     } else {
-      qPath = NutchConf.get().get("prune.index.tool.queries");
-      is = NutchConf.get().getConfResourceAsInputStream(qPath);
+        NutchConf nutchConf = new NutchConf();
+        qPath = nutchConf.get("prune.index.tool.queries");
+        is = nutchConf.getConfResourceAsInputStream(qPath);
     }
     if (is == null) {
       LOG.severe("Can't load queries from " + qPath);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java Tue Jan 31 08:08:58 2006
@@ -18,11 +18,12 @@
 
 import java.util.*;
 import java.net.URL;
-import java.net.URLClassLoader;
-import java.net.MalformedURLException;
 import java.io.*;
 import java.util.logging.Logger;
+
 import javax.xml.parsers.*;
+
+import org.apache.nutch.plugin.PluginRepository;
 import org.w3c.dom.*;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.Transformer;
@@ -45,16 +46,13 @@
   private static final Logger LOG =
     LogFormatter.getLogger("org.apache.nutch.util.NutchConf");
 
-  private static final NutchConf DEFAULT = new NutchConf();
-
-  /** Return the default configuration. */
-  public static NutchConf get() { return DEFAULT; }
-
   private ArrayList resourceNames = new ArrayList();
   private Properties properties;
   private ClassLoader classLoader = 
     Thread.currentThread().getContextClassLoader();
 
+  private PluginRepository pluginRepository;
+  
   /** A new configuration. */
   public NutchConf() {
     resourceNames.add("nutch-default.xml");
@@ -89,9 +87,21 @@
     resourceNames.add(resourceNames.size()-1, name); // add second to last
     properties = null;                            // trigger reload
   }
+  
+  /**
+   * @return a cached instance of the plugin repository
+   */
+  public PluginRepository getPluginRepository() {
+    if (this.pluginRepository == null) {
+      this.pluginRepository = new PluginRepository(this);
+    }
+    return this.pluginRepository;
+  }
 
-  /** Returns the value of the <code>name</code> property, or null if no
-   * such property exists. */
+  /**
+   * Returns the value of the <code>name</code> property, or null if no such
+   * property exists.
+   */
   public Object getObject(String name) { return getProps().get(name);}
 
   /** Sets the value of the <code>name</code> property. */
@@ -390,8 +400,13 @@
       conf.appendChild(doc.createTextNode("\n"));
       for (Enumeration e = properties.keys(); e.hasMoreElements();) {
         String name = (String)e.nextElement();
-        String value = (String)properties.get(name);
-      
+        Object object = properties.get(name);
+        String value = null;
+        if(object instanceof String) {
+          value = (String) object;
+        }else {
+          continue;
+        }
         Element propNode = doc.createElement("property");
         conf.appendChild(propNode);
       
@@ -437,7 +452,7 @@
 
   /** For debugging.  List non-default properties to the terminal and exit. */
   public static void main(String[] args) throws Exception {
-    get().write(System.out);
+    new NutchConf().write(System.out);
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -30,6 +30,7 @@
 
 import java.util.logging.Logger;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 
 import java.util.*;
 import java.net.URL;
@@ -43,6 +44,8 @@
   /** The name of the document field we use. */
   public static String FIELD = "cc";
 
+  private NutchConf nutchConf;
+
   public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
     
@@ -98,6 +101,14 @@
   
   private void addFeature(Document doc, String feature) {
     doc.add(Field.Keyword(FIELD, feature));
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Tue Jan 31 08:08:58 2006
@@ -36,8 +36,6 @@
   public static final Logger LOG
     = LogFormatter.getLogger(CCParseFilter.class.getName());
 
-  private static final boolean EXCLUDE_UNLICENSED =
-    NutchConf.get().getBoolean("creativecommons.exclude.unlicensed", false);
 
   /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
   public static class Walker {
@@ -52,7 +50,7 @@
     }
 
     /** Scan the document adding attributes to metadata.*/
-    public static void walk(Node doc, URL base, ContentProperties metadata)
+    public static void walk(Node doc, URL base, ContentProperties metadata, NutchConf nutchConf)
       throws ParseException {
 
       // walk the DOM tree, scanning for license data
@@ -71,7 +69,7 @@
       } else if (walker.anchorLicense != null) {  // 3rd: anchor w/ CC license
         licenseLocation = "a";
         licenseUrl = walker.anchorLicense.toString();
-      } else if (EXCLUDE_UNLICENSED) {
+      } else if (nutchConf.getBoolean("creativecommons.exclude.unlicensed", false)) {
         throw new ParseException("No CC license.  Excluding.");
       }
 
@@ -251,6 +249,8 @@
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
   }
 
+  private NutchConf nutchConf;
+
   /** Adds metadata or otherwise modifies a parse of an HTML document, given
    * the DOM tree of a page. */
   public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -260,17 +260,24 @@
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     }
 
     try {
       // extract license metadata
-      Walker.walk(doc, base, parse.getData().getMetadata());
+      Walker.walk(doc, base, parse.getData().getMetadata(), getConf());
     } catch (ParseException e) {
-      return new ParseStatus(e).getEmptyParse();
+      return new ParseStatus(e).getEmptyParse(getConf());
     }
 
     return parse;
   }
 
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java Tue Jan 31 08:08:58 2006
@@ -17,11 +17,24 @@
 package org.creativecommons.nutch;
 
 import org.apache.nutch.searcher.RawFieldQueryFilter;
+import org.apache.nutch.util.NutchConf;
 
-/** Handles "cc:" query clauses, causing them to search the "cc" field
- * indexed by CCIndexingFilter. */
+/**
+ * Handles "cc:" query clauses, causing them to search the "cc" field indexed by
+ * CCIndexingFilter.
+ */
 public class CCQueryFilter extends RawFieldQueryFilter {
+  private NutchConf nutchConf;
+
   public CCQueryFilter() {
     super(CCIndexingFilter.FIELD);
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Jan 31 08:08:58 2006
@@ -1,68 +1,70 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ContentProperties;
-
-import java.util.Properties;
-import java.io.*;
-import java.net.URL;
-
-import junit.framework.TestCase;
-
-public class TestCCParseFilter extends TestCase {
-
-  private static final File testDir =
-    new File(System.getProperty("test.input"));
-
-  public void testPages() throws Exception {
-    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
-    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
-    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
-  }
-
-  public void pageTest(File file, String url,
-                       String license, String location, String type)
-    throws Exception {
-
-    String contentType = "text/html";
-    InputStream in = new FileInputStream(file);
-    ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
-    byte[] buffer = new byte[1024];
-    int i;
-    while ((i = in.read(buffer)) != -1) {
-      out.write(buffer, 0, i);
-    }
-    in.close();
-    byte[] bytes = out.toByteArray();
-
-    Content content =
-      new Content(url, url, bytes, contentType, new ContentProperties());
-    Parse parse = ParseUtil.parseByParserId("parse-html",content);
-
-    ContentProperties metadata = parse.getData().getMetadata();
-    assertEquals(license, metadata.get("License-Url"));
-    assertEquals(location, metadata.get("License-Location"));
-    assertEquals(type, metadata.get("Work-Type"));
-  }
-}
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
+
+import java.util.Properties;
+import java.io.*;
+import java.net.URL;
+
+import junit.framework.TestCase;
+
+public class TestCCParseFilter extends TestCase {
+
+  private static final File testDir =
+    new File(System.getProperty("test.input"));
+
+  public void testPages() throws Exception {
+    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+             "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+             "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+             "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+  }
+
+  public void pageTest(File file, String url,
+                       String license, String location, String type)
+    throws Exception {
+
+    String contentType = "text/html";
+    InputStream in = new FileInputStream(file);
+    ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
+    byte[] buffer = new byte[1024];
+    int i;
+    while ((i = in.read(buffer)) != -1) {
+      out.write(buffer, 0, i);
+    }
+    in.close();
+    byte[] bytes = out.toByteArray();
+    NutchConf nutchConf = new NutchConf();
+
+    Content content =
+      new Content(url, url, bytes, contentType, new ContentProperties(), nutchConf);
+    Parse parse = new ParseUtil(nutchConf).parseByParserId("parse-html",content);
+
+    ContentProperties metadata = parse.getData().getMetadata();
+    assertEquals(license, metadata.get("License-Url"));
+    assertEquals(location, metadata.get("License-Location"));
+    assertEquals(type, metadata.get("Work-Type"));
+  }
+}

Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -40,8 +40,8 @@
   public static final Logger LOG
     = LogFormatter.getLogger(BasicIndexingFilter.class.getName());
 
-  private static final int MAX_TITLE_LENGTH =
-    NutchConf.get().getInt("indexer.max.title.length", 100);
+  private int MAX_TITLE_LENGTH;
+  private NutchConf nutchConf;
 
   public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
@@ -87,6 +87,15 @@
     doc.add(Field.Text("title", title));
 
     return doc;
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -75,13 +75,10 @@
     = LogFormatter.getLogger(MoreIndexingFilter.class.getName());
 
   /** A flag that tells if magic resolution must be performed */
-  private final static boolean MAGIC =
-        NutchConf.get().getBoolean("mime.type.magic", true);
+  private boolean MAGIC;
 
   /** Get the MimeTypes resolver instance. */
-  private final static MimeTypes MIME = 
-        MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
+  private MimeTypes MIME; 
   
   public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
     throws IndexingException {
@@ -247,6 +244,8 @@
   // HTTP header "Content-Disposition". Typically it looks like:
   // Content-Disposition: inline; filename="foo.ppt"
   private PatternMatcher matcher = new Perl5Matcher();
+
+  private NutchConf nutchConf;
   static Perl5Pattern patterns[] = {null, null};
   static {
     Perl5Compiler compiler = new Perl5Compiler();
@@ -300,6 +299,16 @@
     }
 
     return normalized;
+  }
+
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    MAGIC = conf.getBoolean("mime.type.magic", true);
+    MIME = MimeTypes.get(getConf().get("mime.types.file"));
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 
 }

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Tue Jan 31 08:08:58 2006
@@ -28,6 +28,7 @@
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 
 // DOM imports
 import org.w3c.dom.DocumentFragment;
@@ -67,6 +68,8 @@
       LOG.severe(e.toString());
     }
   }
+
+  private NutchConf nutchConf;
   
 
   
@@ -195,5 +198,11 @@
     
   }
 
-      
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Tue Jan 31 08:08:58 2006
@@ -95,12 +95,12 @@
   /**
    * Constructs a new Language Identifier.
    */
-  private LanguageIdentifier() {
+  public LanguageIdentifier(NutchConf nutchConf) {
 
     // Gets ngram sizes to take into account from the Nutch Config
-    minLength = NutchConf.get().getInt("lang.ngram.min.length",
+    minLength = nutchConf.getInt("lang.ngram.min.length",
                                        NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
-    maxLength = NutchConf.get().getInt("lang.ngram.max.length",
+    maxLength = nutchConf.getInt("lang.ngram.max.length",
                                        NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
     // Ensure the min and max values are in an acceptale range
     // (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
@@ -110,7 +110,7 @@
     minLength = Math.min(minLength, maxLength);
 
     // Gets the value of the maximum size of data to analyze
-    analyzeLength = NutchConf.get().getInt("lang.analyze.max.length",
+    analyzeLength = nutchConf.getInt("lang.analyze.max.length",
                                            DEFAULT_ANALYSIS_LENGTH);
     
     Properties p = new Properties();
@@ -174,20 +174,6 @@
     }
   }
 
-  /**
-   * Get a LanguageIdentifier instance.
-   * @return the LanguageIdentifier singleton instance.
-   */
-  public static LanguageIdentifier getInstance() {
-    if (identifier == null) {
-        synchronized(LanguageIdentifier.class) {
-            if (identifier == null) {
-                identifier = new LanguageIdentifier();
-            }
-        }
-    }
-    return identifier;
-  }
 
   /**
    * Main method used for command line process.
@@ -272,9 +258,10 @@
 
     }
 
+    NutchConf nutchConf = new NutchConf();
     String lang = null;
     //LanguageIdentifier idfr = LanguageIdentifier.getInstance();
-    LanguageIdentifier idfr = new LanguageIdentifier();
+    LanguageIdentifier idfr = new LanguageIdentifier(nutchConf);
     File f;
     FileInputStream fis;
     try {
@@ -292,7 +279,7 @@
           break;
 
         case IDURL:
-          text = getUrlContent(filename);
+          text = getUrlContent(filename, nutchConf);
           lang = idfr.identify(text);
           break;
 
@@ -348,13 +335,13 @@
    * @param url
    * @return contents of url
    */
-  private static String getUrlContent(String url) {
+  private static String getUrlContent(String url, NutchConf nutchConf) {
     Protocol protocol;
     try {
-      protocol = ProtocolFactory.getProtocol(url);
+      protocol = new ProtocolFactory(nutchConf).getProtocol(url);
       Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
       String contentType = content.getContentType();
-      Parser parser = ParserFactory.getParser(contentType, url);
+      Parser parser = new ParserFactory(nutchConf).getParser(contentType, url);
       Parse parse = parser.getParse(content);
       System.out.println("text:" + parse.getText());
       return parse.getText();

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConf;
 
 // Lucene imports
 import org.apache.lucene.document.Field;
@@ -49,7 +50,10 @@
 public class LanguageIndexingFilter implements IndexingFilter {
   
 
-  /**
+  private NutchConf nutchConf;
+  private LanguageIdentifier languageIdentifier;
+
+/**
    * Constructs a new Language Indexing Filter.
    */
   public LanguageIndexingFilter() {
@@ -77,7 +81,7 @@
       text.append(parse.getData().getTitle())
           .append(" ")
           .append(parse.getText());
-      lang = LanguageIdentifier.getInstance().identify(text);
+      lang = this.languageIdentifier.identify(text);
     }
 
     if (lang == null) {
@@ -88,5 +92,13 @@
 
     return doc;
   }
+  
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+    this.languageIdentifier = new LanguageIdentifier(conf);
+  }
 
+  public NutchConf getConf() {
+    return this.nutchConf;
+  }
 }

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java Tue Jan 31 08:08:58 2006
@@ -17,11 +17,22 @@
 package org.apache.nutch.analysis.lang;
 
 import org.apache.nutch.searcher.RawFieldQueryFilter;
+import org.apache.nutch.util.NutchConf;
 
 /** Handles "lang:" query clauses, causing them to search the "lang" field
  * indexed by LanguageIdentifier. */
 public class LanguageQueryFilter extends RawFieldQueryFilter {
+  private NutchConf nutchConf;
+
   public LanguageQueryFilter() {
     super("lang");
+  }
+  
+  public void setConf(NutchConf conf) {
+    this.nutchConf = conf;
+  }
+
+  public NutchConf getConf() {
+    return this.nutchConf;
   }
 }

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
 
 
 public class TestHTMLLanguageParser extends TestCase {
@@ -52,7 +53,7 @@
       for (int t = 0; t < docs.length; t++) {
 
         Content content = getContent(docs[t]);
-        Parser parser = ParserFactory.getParser("text/html", URL);
+        Parser parser = new ParserFactory(new NutchConf()).getParser("text/html", URL);
         Parse parse = parser.getParse(content);
 
         assertEquals(metalanguages[t], (String) parse.getData().get(
@@ -125,7 +126,7 @@
     ContentProperties p = new ContentProperties();
     p.put("Content-Type", "text/html");
 
-    Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
+    Content content = new Content(URL, BASE, text.getBytes(), "text/html", p, new NutchConf());
     return content;
   }
 

Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Tue Jan 31 08:08:58 2006
@@ -32,6 +32,7 @@
 
 // Lucene imports
 import org.apache.lucene.analysis.Token;
+import org.apache.nutch.util.NutchConf;
 
 
 /**
@@ -204,7 +205,7 @@
     public void testIdentify() {
         try {
             long total = 0;
-            LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+            LanguageIdentifier idfr = new LanguageIdentifier(new NutchConf());
             BufferedReader in = new BufferedReader(new InputStreamReader(
                         this.getClass().getResourceAsStream("test-referencial.txt")));
             String line = null;

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
 import java.util.LinkedList;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.net.protocols.Response;
@@ -43,35 +44,46 @@
   
   public static final int BUFFER_SIZE = 8 * 1024;
   
-  public static String PROXY_HOST =
-          NutchConf.get().get("http.proxy.host");
-  
-  public static int PROXY_PORT =
-          NutchConf.get().getInt("http.proxy.port", 8080);
-  
-  public static boolean PROXY =
-          (PROXY_HOST != null && PROXY_HOST.length() > 0);
-  
-  public static int TIMEOUT =
-          NutchConf.get().getInt("http.timeout", 10000);
-  
-  public static int MAX_CONTENT =
-          NutchConf.get().getInt("http.content.limit", 64 * 1024);
-  
-  public static int MAX_DELAYS =
-          NutchConf.get().getInt("http.max.delays", 3);
-  
-  public static int MAX_THREADS_PER_HOST =
-          NutchConf.get().getInt("fetcher.threads.per.host", 1);
-  
-  public static String AGENT_STRING =
-          getAgentString();
-  
-  public static long SERVER_DELAY =
-          (long) (NutchConf.get().getFloat("fetcher.server.delay", 1.0f) * 1000);
-  
-
   private static final byte[] EMPTY_CONTENT = new byte[0];
+
+  private RobotRulesParser robots = null;
+ 
+  /** The proxy hostname. */ 
+  protected String proxyHost = null;
+
+  /** The proxy port. */
+  protected int proxyPort = 8080; 
+
+  /** Indicates if a proxy is used */
+  protected boolean useProxy = false;
+
+  /** The network timeout, in milliseconds. */
+  protected int timeout = 10000;
+
+  /** The length limit for downloaded content, in bytes. */
+  protected int maxContent = 64 * 1024; 
+
+  /** The number of times a thread will delay when trying to fetch a page. */
+  protected int maxDelays = 3;
+
+  /**
+   * The maximum number of threads that should be allowed
+   * to access a host at one time.
+   */
+  protected int maxThreadsPerHost = 1; 
+
+  /**
+   * The number of milliseconds the fetcher will delay between
+   * successive requests to the same server.
+   */
+  protected long serverDelay = 1000;
+
+  /** The Nutch 'User-Agent' request header */
+  protected String userAgent = getAgentString(
+                        "NutchCVS", null, "Nutch",
+                        "http://lucene.apache.org/nutch/bot.html",
+                        "nutch-agent@lucene.apache.org");
+
     
   /**
    * Maps from InetAddress to a Long naming the time it should be unblocked.
@@ -97,7 +109,10 @@
 
   /** The specified logger */
   private Logger logger = LOGGER;
-  
+ 
+  /** The nutch configuration */
+  private NutchConf conf = null;
+ 
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -109,14 +124,32 @@
     if (logger != null) {
       this.logger = logger;
     }
-    logger.fine("http.proxy.host = " + PROXY_HOST);
-    logger.fine("http.proxy.port = " + PROXY_PORT);
-    logger.fine("http.timeout = " + TIMEOUT);
-    logger.fine("http.content.limit = " + MAX_CONTENT);
-    logger.fine("http.agent = " + AGENT_STRING);
-    logger.fine("fetcher.server.delay = " + SERVER_DELAY);
-    logger.fine("http.max.delays = " + MAX_DELAYS);
+    robots = new RobotRulesParser();
+  }
+  
+   // Inherited Javadoc
+    public void setConf(NutchConf conf) {
+        this.conf = conf;
+        this.proxyHost = conf.get("http.proxy.host");
+        this.proxyPort = conf.getInt("http.proxy.port", 8080);
+        this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+        this.timeout = conf.getInt("http.timeout", 10000);
+        this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+        this.maxDelays = conf.getInt("http.max.delays", 3);
+        this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1);
+        this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
+                .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
+        this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
+        this.robots.setConf(conf);
+        logConf();
+    }
+
+  // Inherited Javadoc
+  public NutchConf getConf() {
+    return this.conf;
   }
+   
+  
   
   public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
     
@@ -125,7 +158,7 @@
       URL u = new URL(urlString);
       
       try {
-        if (!RobotRulesParser.isAllowed(this, u)) {
+        if (!robots.isAllowed(this, u)) {
           return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
         }
       } catch (Throwable e) {
@@ -146,7 +179,7 @@
       Content c = new Content(u.toString(), u.toString(),
                               (content == null ? EMPTY_CONTENT : content),
                               response.getHeader("Content-Type"),
-                              response.getHeaders());
+                              response.getHeaders(), this.conf);
       
       if (code == 200) { // got a good response
         return new ProtocolOutput(c); // return it
@@ -203,8 +236,49 @@
     }
   }
   
-  
-  private static InetAddress blockAddr(URL url) throws ProtocolException {
+  /* -------------------------- *
+   * </implementation:Protocol> *
+   * -------------------------- */
+
+
+  public String getProxyHost() {
+    return proxyHost;
+  }
+
+  public int getProxyPort() {
+    return proxyPort;
+  }
+
+  public boolean useProxy() {
+    return useProxy;
+  }
+
+  public int getTimeout() {
+    return timeout;
+  }
+
+  public int getMaxContent() {
+    return maxContent;
+  }
+
+  public int getMaxDelays() {
+    return maxDelays;
+  }
+
+  public int getMaxThreadsPerHost() {
+    return maxThreadsPerHost;
+  }
+
+  public long getServerDelay() {
+    return serverDelay;
+  }
+
+  public String getUserAgent() {
+    return userAgent;
+  }
+
+
+  private InetAddress blockAddr(URL url) throws ProtocolException {
     
     InetAddress addr;
     try {
@@ -229,21 +303,21 @@
           count++;                              // increment & store
           THREADS_PER_HOST_COUNT.put(addr, new Integer(count));
           
-          if (count >= MAX_THREADS_PER_HOST) {
+          if (count >= maxThreadsPerHost) {
             BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
           }
           return addr;
         }
       }
       
-      if (delays == MAX_DELAYS)
+      if (delays == maxDelays)
         throw new HttpException("Exceeded http.max.delays: retry later.");
       
       long done = time.longValue();
       long now = System.currentTimeMillis();
       long sleep = 0;
       if (done == 0) {                            // address is still in use
-        sleep = SERVER_DELAY;                     // wait at least delay
+        sleep = serverDelay;                      // wait at least delay
         
       } else if (now < done) {                    // address is on hold
         sleep = done - now;                       // wait until its free
@@ -256,14 +330,14 @@
     }
   }
   
-  private static void unblockAddr(InetAddress addr) {
+  private void unblockAddr(InetAddress addr) {
     synchronized (BLOCKED_ADDR_TO_TIME) {
       int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue();
       if (addrCount == 1) {
         THREADS_PER_HOST_COUNT.remove(addr);
         BLOCKED_ADDR_QUEUE.addFirst(addr);
         BLOCKED_ADDR_TO_TIME.put
-                (addr, new Long(System.currentTimeMillis()+SERVER_DELAY));
+                (addr, new Long(System.currentTimeMillis() + serverDelay));
       } else {
         THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
       }
@@ -285,13 +359,11 @@
     }
   }
   
-  private static String getAgentString() {
-    
-    String agentName = NutchConf.get().get("http.agent.name");
-    String agentVersion = NutchConf.get().get("http.agent.version");
-    String agentDesc = NutchConf.get().get("http.agent.description");
-    String agentURL = NutchConf.get().get("http.agent.url");
-    String agentEmail = NutchConf.get().get("http.agent.email");
+  private static String getAgentString(String agentName,
+                                       String agentVersion,
+                                       String agentDesc,
+                                       String agentURL,
+                                       String agentEmail) {
     
     if ( (agentName == null) || (agentName.trim().length() == 0) )
       LOGGER.severe("No User-Agent string set (http.agent.name)!");
@@ -327,6 +399,16 @@
     }
     return buf.toString();
   }
+
+  protected void logConf() {
+    logger.info("http.proxy.host = " + proxyHost);
+    logger.info("http.proxy.port = " + proxyPort);
+    logger.info("http.timeout = " + timeout);
+    logger.info("http.content.limit = " + maxContent);
+    logger.info("http.agent = " + userAgent);
+    logger.info("fetcher.server.delay = " + serverDelay);
+    logger.info("http.max.delays = " + maxDelays);
+  }
   
   protected static void main(HttpBase http, String[] args) throws Exception {
     boolean verbose = false;
@@ -341,7 +423,7 @@
     
     for (int i = 0; i < args.length; i++) { // parse command line
       if (args[i].equals("-timeout")) { // found -timeout option
-        TIMEOUT = Integer.parseInt(args[++i]) * 1000;
+        http.timeout = Integer.parseInt(args[++i]) * 1000;
       } else if (args[i].equals("-verbose")) { // found -verbose option
         verbose = true;
       } else if (i != args.length - 1) {

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Jan 31 08:08:58 2006
@@ -30,6 +30,7 @@
 
 // Nutch imports
 import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.protocols.Response;
@@ -45,14 +46,13 @@
  * @author Mike Cafarella
  * @author Doug Cutting
  */
-public class RobotRulesParser {
+public class RobotRulesParser implements NutchConfigurable {
+  
   public static final Logger LOG=
-    LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
+    LogFormatter.getLogger(RobotRulesParser.class.getName());
 
-  private static final boolean ALLOW_FORBIDDEN =
-    NutchConf.get().getBoolean("http.robots.403.allow", false);
+  private boolean allowForbidden = false;
 
-  private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
   
   private static final String CHARACTER_ENCODING= "UTF-8";
@@ -60,9 +60,9 @@
     
   private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
 
-  private static RobotRuleSet FORBID_ALL_RULES =
-    new RobotRulesParser().getForbidAllRules();
+  private static RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();
 
+  private NutchConf conf;
   private HashMap robotNames;
 
   /**
@@ -87,14 +87,6 @@
     }
 
     /**
-     * should not be instantiated from outside RobotRulesParser
-     */
-    private RobotRuleSet() {
-      tmpEntries= new ArrayList();
-      entries= null;
-    }
-
-    /**
      */
     private void addPrefix(String prefix, boolean allow) {
       if (tmpEntries == null) {
@@ -182,14 +174,25 @@
   }
 
 
-  public RobotRulesParser() { this(AGENTS); }
+  RobotRulesParser() { }
+
+  public RobotRulesParser(NutchConf conf) {
+    setConf(conf);
+  }
 
-  private static String[] getAgents() {
+
+  /* ---------------------------------- *
+   * <implementation:NutchConfigurable> *
+   * ---------------------------------- */
+
+  public void setConf(NutchConf conf) {
+    this.conf = conf;
+    allowForbidden = conf.getBoolean("http.robots.403.allow", false);
     //
     // Grab the agent names we advertise to robots files.
     //
-    String agentName = NutchConf.get().get("http.agent.name");
-    String agentNames = NutchConf.get().get("http.robots.agents");
+    String agentName = conf.get("http.agent.name");
+    String agentNames = conf.get("http.robots.agents");
     StringTokenizer tok = new StringTokenizer(agentNames, ",");
     ArrayList agents = new ArrayList();
     while (tok.hasMoreTokens()) {
@@ -197,22 +200,38 @@
     }
 
     //
-    // If there are no agents for robots-parsing, use our 
+    // If there are no agents for robots-parsing, use our
     // default agent-string.  If both are present, our agent-string
     // should be the first one we advertise to robots-parsing.
-    // 
+    //
     if (agents.size() == 0) {
       agents.add(agentName);
       LOG.severe("No agents listed in 'http.robots.agents' property!");
     } else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
       agents.add(0, agentName);
-      LOG.severe("Agent we advertise (" + agentName 
+      LOG.severe("Agent we advertise (" + agentName
                  + ") not listed first in 'http.robots.agents' property!");
     }
+    setRobotNames((String[]) agents.toArray(new String[agents.size()]));
+  }
 
-    return (String[])agents.toArray(new String[agents.size()]);
+  public NutchConf getConf() {
+    return conf;
   }
 
+  /* ----------------------------------- *
+   * </implementation:NutchConfigurable> *
+   * ----------------------------------- */
+
+  private void setRobotNames(String[] robotNames) {
+    this.robotNames= new HashMap();
+    for (int i= 0; i < robotNames.length; i++) {
+      this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
+    }
+    // always make sure "*" is included
+    if (!this.robotNames.containsKey("*"))
+      this.robotNames.put("*", new Integer(robotNames.length));
+  }
 
   /**
    *  Creates a new <code>RobotRulesParser</code> which will use the
@@ -223,14 +242,8 @@
    *  rules associated with the robot name having the smallest index
    *  will be used.
    */
-  public RobotRulesParser(String[] robotNames) {
-    this.robotNames= new HashMap();
-    for (int i= 0; i < robotNames.length; i++) {
-      this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
-    }
-    // always make sure "*" is included
-    if (!this.robotNames.containsKey("*"))
-      this.robotNames.put("*", new Integer(robotNames.length));
+  RobotRulesParser(String[] robotNames) {
+    setRobotNames(robotNames); 
   }
 
   /**
@@ -368,7 +381,7 @@
     return rules;
   }
   
-  public static boolean isAllowed(HttpBase http, URL url)
+  public boolean isAllowed(HttpBase http, URL url)
     throws ProtocolException, IOException {
 
     String host = url.getHost();
@@ -382,8 +395,8 @@
                                              new CrawlDatum(), true);
 
         if (response.getCode() == 200)               // found rules: parse them
-          robotRules = new RobotRulesParser().parseRules(response.getContent());
-        else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
+          robotRules = parseRules(response.getContent());
+        else if ( (response.getCode() == 403) && (!allowForbidden) )
           robotRules = FORBID_ALL_RULES;            // use forbid all
         else                                        
           robotRules = EMPTY_RULES;                 // use default rules

Modified: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java Tue Jan 31 08:08:58 2006
@@ -325,9 +325,10 @@
 
   public static void main( String[] args ) throws Exception {
 
-    Ontology ontology = OntologyFactory.getOntology();
+    NutchConf nutchConf = new NutchConf(); 
+    Ontology ontology = new OntologyFactory(nutchConf).getOntology();
 
-    String urls = NutchConf.get().get("extension.ontology.urls");
+    String urls = nutchConf.get("extension.ontology.urls");
     if (urls==null || urls.trim().equals("")) {
       LOG.severe("No ontology url found.");
       return;

Modified: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java Tue Jan 31 08:08:58 2006
@@ -25,6 +25,7 @@
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
 
 import junit.framework.TestCase;
 
@@ -50,12 +51,14 @@
   private String[] sampleFiles = {"time.owl"};
 
   private static Ontology ontology;
-
+  private NutchConf nutchConf;
   public TestOntology(String name) { 
     super(name); 
   }
 
-  protected void setUp() {}
+  protected void setUp() {
+      this.nutchConf = new NutchConf();
+  }
 
   protected void tearDown() {}
 
@@ -66,7 +69,7 @@
 
     if (ontology==null) {
       try {
-        ontology = OntologyFactory.getOntology();
+        ontology = new OntologyFactory(this.nutchConf).getOntology();
       } catch (Exception e) {
         throw new Exception("Failed to instantiate ontology");
       }