You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/01/31 17:13:17 UTC
svn commit: r373853 [4/6] - in /lucene/nutch/trunk/src:
java/org/apache/nutch/analysis/ java/org/apache/nutch/clustering/
java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/
java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apac...
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java Tue Jan 31 08:08:58 2006
@@ -25,12 +25,15 @@
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.Query.Phrase;
+import org.apache.nutch.util.NutchConf;
/** Translate query fields to search the same-named field, as indexed by an
* IndexingFilter. Best for tokenized fields. */
public abstract class FieldQueryFilter implements QueryFilter {
private String field;
private float boost = 1.0f;
+ private NutchConf nutchConf;
+ private CommonGrams commonGrams;
/** Construct for the named field.*/
protected FieldQueryFilter(String field) {
@@ -57,12 +60,12 @@
// optimize phrase clause
if (c.isPhrase()) {
- String[] opt = CommonGrams.optimizePhrase(c.getPhrase(), field);
+ String[] opt = this.commonGrams.optimizePhrase(c.getPhrase(), field);
if (opt.length==1) {
c = new Clause(new Query.Term(opt[0]),
- c.isRequired(), c.isProhibited());
+ c.isRequired(), c.isProhibited(), getConf());
} else {
- c = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited());
+ c = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited(), getConf());
}
}
@@ -88,5 +91,14 @@
// return the modified Lucene query
return output;
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.commonGrams = new CommonGrams(conf);
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Tue Jan 31 08:08:58 2006
@@ -47,38 +47,43 @@
private org.apache.lucene.search.Searcher luceneSearcher;
private org.apache.lucene.index.IndexReader reader;
-
- private LuceneQueryOptimizer optimizer = new LuceneQueryOptimizer
- (NutchConf.get().getInt("searcher.filter.cache.size", 16),
- NutchConf.get().getFloat("searcher.filter.cache.threshold", 0.05f));
+ private LuceneQueryOptimizer optimizer;
+ private NutchFileSystem fs;
+ private NutchConf nutchConf;
+ private QueryFilters queryFilters;
/** Construct given a number of indexes. */
- public IndexSearcher(File[] indexDirs) throws IOException {
+ public IndexSearcher(File[] indexDirs, NutchConf nutchConf) throws IOException {
IndexReader[] readers = new IndexReader[indexDirs.length];
+ this.nutchConf = nutchConf;
+ this.fs = NutchFileSystem.get(nutchConf);
for (int i = 0; i < indexDirs.length; i++) {
readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
}
- init(new MultiReader(readers));
+ init(new MultiReader(readers), nutchConf);
}
/** Construct given a single merged index. */
- public IndexSearcher(File index)
+ public IndexSearcher(File index, NutchConf nutchConf)
throws IOException {
- init(IndexReader.open(getDirectory(index)));
+ this.nutchConf = nutchConf;
+ this.fs = NutchFileSystem.get(nutchConf);
+ init(IndexReader.open(getDirectory(index)), nutchConf);
}
- private void init(IndexReader reader) throws IOException {
+ private void init(IndexReader reader, NutchConf nutchConf) throws IOException {
this.reader = reader;
this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
this.luceneSearcher.setSimilarity(new NutchSimilarity());
+ this.optimizer = new LuceneQueryOptimizer(nutchConf);
+ this.queryFilters = new QueryFilters(nutchConf);
}
private Directory getDirectory(File file) throws IOException {
- NutchFileSystem fs = NutchFileSystem.get();
- if ("local".equals(fs.getName())) {
+ if ("local".equals(this.fs.getName())) {
return FSDirectory.getDirectory(file, false);
} else {
- return new NdfsDirectory(fs, file, false);
+ return new NdfsDirectory(this.fs, file, false, this.nutchConf);
}
}
@@ -86,10 +91,8 @@
String dedupField, String sortField, boolean reverse)
throws IOException {
-
org.apache.lucene.search.BooleanQuery luceneQuery =
- QueryFilters.filter(query);
-
+ this.queryFilters.filter(query);
return translateHits
(optimizer.optimize(luceneQuery, luceneSearcher, numHits,
sortField, reverse),
@@ -97,7 +100,7 @@
}
public String getExplanation(Query query, Hit hit) throws IOException {
- return luceneSearcher.explain(QueryFilters.filter(query),
+ return luceneSearcher.explain(this.queryFilters.filter(query),
hit.getIndexDocNo()).toHtml();
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java Tue Jan 31 08:08:58 2006
@@ -11,6 +11,7 @@
import org.apache.nutch.crawl.LinkDbReader;
import org.apache.nutch.fs.NutchFileSystem;
import org.apache.nutch.io.UTF8;
+import org.apache.nutch.util.NutchConf;
import java.io.File;
@@ -18,8 +19,8 @@
private LinkDbReader linkdb = null;
- public LinkDbInlinks(NutchFileSystem fs, File dir) {
- linkdb = new LinkDbReader(fs, dir);
+ public LinkDbInlinks(NutchFileSystem fs, File dir, NutchConf nutchConf) {
+ linkdb = new LinkDbReader(fs, dir, nutchConf);
}
public String[] getAnchors(HitDetails details) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Tue Jan 31 08:08:58 2006
@@ -37,8 +37,6 @@
* which do not affect ranking but might otherwise slow search considerably. */
class LuceneQueryOptimizer {
- private static int MAX_HITS = NutchConf.get().getInt("searcher.max.hits",-1);
-
private static class LimitExceeded extends RuntimeException {
private int maxDoc;
public LimitExceeded(int maxDoc) { this.maxDoc = maxDoc; }
@@ -63,18 +61,28 @@
private float threshold;
- /** Construct an optimizer that caches and uses filters for required clauses
+ private int searcherMaxHits;
+
+ /**
+ * Construct an optimizer that caches and uses filters for required clauses
* whose boost is zero.
- * @param cacheSize the number of QueryFilters to cache
- * @param threshold the fraction of documents which must contain a term
+ *
+ * @param nutchConf
+ * supplies "searcher.filter.cache.size" (number of QueryFilters to
+ * cache), "searcher.filter.cache.threshold" (fraction of documents
+ * which must contain a term) and "searcher.max.hits"
*/
- public LuceneQueryOptimizer(final int cacheSize, float threshold) {
+ public LuceneQueryOptimizer(NutchConf nutchConf) {
+ final int cacheSize = nutchConf.getInt("searcher.filter.cache.size", 16);
+ this.threshold = nutchConf.getFloat("searcher.filter.cache.threshold",
+ 0.05f);
+ // a value <= 0 means searches run without a hit limit
+ this.searcherMaxHits = nutchConf.getInt("searcher.max.hits", -1);
this.cache = new LinkedHashMap(cacheSize, 0.75f, true) {
- protected boolean removeEldestEntry(Map.Entry eldest) {
- return size() > cacheSize; // limit size of cache
- }
- };
- this.threshold = threshold;
+ protected boolean removeEldestEntry(Map.Entry eldest) {
+ return size() > cacheSize; // limit size of cache
+ }
+ };
}
public TopDocs optimize(BooleanQuery original,
@@ -123,7 +131,6 @@
}
Filter filter = null;
-
if (cacheQuery.getClauses().length != 0) {
synchronized (cache) { // check cache
filter = (Filter)cache.get(cacheQuery);
@@ -151,12 +158,12 @@
if (sortField == null && !reverse) {
// no hit limit
- if (MAX_HITS <= 0) {
+ if (this.searcherMaxHits <= 0) {
return searcher.search(query, filter, numHits);
}
// hits limited -- use a LimitedCollector
- LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS);
+ LimitedCollector collector = new LimitedCollector(numHits, searcherMaxHits);
LimitExceeded exceeded = null;
try {
searcher.search(query, filter, collector);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Tue Jan 31 08:08:58 2006
@@ -42,8 +42,6 @@
LogFormatter.setShowThreadIDs(true);
}
- private NutchFileSystem fs = NutchFileSystem.get();
-
private String[] segmentNames;
private Searcher searcher;
@@ -52,50 +50,65 @@
private HitContent content;
private HitInlinks linkDb;
- private float RAW_HITS_FACTOR =
- NutchConf.get().getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
/** BooleanQuery won't permit more than 32 required/prohibited clauses. We
* don't want to use too many of those. */
private static final int MAX_PROHIBITED_TERMS = 20;
+
+ private NutchConf nutchConf;
+
+ private NutchFileSystem fs;
/** Cache in servlet context. */
- public static NutchBean get(ServletContext app) throws IOException {
+ public static NutchBean get(ServletContext app, NutchConf conf) throws IOException {
NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
if (bean == null) {
LOG.info("creating new bean");
- bean = new NutchBean();
+ bean = new NutchBean(conf);
app.setAttribute("nutchBean", bean);
}
return bean;
}
- /** Construct reading from connected directory. */
- public NutchBean() throws IOException {
- this(new File(NutchConf.get().get("searcher.dir", "crawl")));
- }
- /** Construct in a named directory. */
- public NutchBean(File dir) throws IOException {
- File servers = new File(dir, "search-servers.txt");
- if (fs.exists(servers)) {
- LOG.info("searching servers in " + servers.getCanonicalPath());
- init(new DistributedSearch.Client(servers));
- } else {
- init(new File(dir, "index"),
- new File(dir, "indexes"),
- new File(dir, "segments"),
- new File(dir, "linkdb"));
- }
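+ /**
+ * Construct reading from the directory named by the "searcher.dir" property (default "crawl").
+ * @param nutchConf the configuration used to locate and open the search data
+ * @throws IOException propagated from the directory-based constructor
+ */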
+ public NutchBean(NutchConf nutchConf) throws IOException {
+ this(nutchConf, null);
}
+
+ /**
+ * Construct in a named directory.
+ * @param nutchConf
+ * @param dir
+ * @throws IOException
+ */
+ public NutchBean(NutchConf nutchConf, File dir) throws IOException {
+ this.nutchConf = nutchConf;
+ this.fs = NutchFileSystem.get(this.nutchConf);
+ if (dir == null) {
+ dir = new File(this.nutchConf.get("searcher.dir", "crawl"));
+ }
+ File servers = new File(dir, "search-servers.txt");
+ if (fs.exists(servers)) {
+ LOG.info("searching servers in " + servers.getCanonicalPath());
+ init(new DistributedSearch.Client(servers, nutchConf));
+ } else {
+ init(new File(dir, "index"), new File(dir, "indexes"), new File(
+ dir, "segments"), new File(dir, "linkdb"));
+ }
+ }
private void init(File indexDir, File indexesDir, File segmentsDir,
File linkDb)
throws IOException {
IndexSearcher indexSearcher;
- if (fs.exists(indexDir)) {
+ if (this.fs.exists(indexDir)) {
LOG.info("opening merged index in " + indexDir);
- indexSearcher = new IndexSearcher(indexDir);
+ indexSearcher = new IndexSearcher(indexDir, this.nutchConf);
} else {
LOG.info("opening indexes in " + indexesDir);
@@ -108,16 +121,17 @@
}
}
+
directories = new File[ vDirs.size() ];
for(int i = 0; vDirs.size()>0; i++) {
directories[i]=(File)vDirs.remove(0);
}
- indexSearcher = new IndexSearcher(directories);
+ indexSearcher = new IndexSearcher(directories, this.nutchConf);
}
LOG.info("opening segments in " + segmentsDir);
- FetchedSegments segments = new FetchedSegments(fs, segmentsDir.toString());
+ FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.nutchConf);
this.segmentNames = segments.getSegmentNames();
@@ -127,7 +141,7 @@
this.content = segments;
LOG.info("opening linkdb in " + linkDb);
- this.linkDb = new LinkDbInlinks(fs, linkDb);
+ this.linkDb = new LinkDbInlinks(fs, linkDb, this.nutchConf);
}
private void init(DistributedSearch.Client client) {
@@ -216,7 +230,8 @@
if (maxHitsPerDup <= 0) // disable dup checking
return search(query, numHits, dedupField, sortField, reverse);
- int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR);
+ float rawHitsFactor = this.nutchConf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
+ int numHitsRaw = (int)(numHits * rawHitsFactor);
LOG.info("searching for "+numHitsRaw+" raw hits");
Hits hits = searcher.search(query, numHitsRaw,
dedupField, sortField, reverse);
@@ -237,7 +252,7 @@
optQuery.addProhibitedTerm(((String)excludedValues.get(i)),
dedupField);
}
- numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR);
+ numHitsRaw = (int)(numHitsRaw * rawHitsFactor);
LOG.info("re-searching for "+numHitsRaw+" raw hits, query: "+optQuery);
hits = searcher.search(optQuery, numHitsRaw,
dedupField, sortField, reverse);
@@ -344,9 +359,9 @@
System.exit(-1);
}
- NutchBean bean = new NutchBean();
- Query query = Query.parse(args[0]);
-
+ NutchConf nutchConf = new NutchConf();
+ NutchBean bean = new NutchBean(nutchConf);
+ Query query = Query.parse(args[0], nutchConf);
Hits hits = bean.search(query, 10);
System.out.println("Total hits: " + hits.getTotal());
int length = (int)Math.min(hits.getTotal(), 10);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Tue Jan 31 08:08:58 2006
@@ -31,6 +31,8 @@
import javax.servlet.http.HttpServletResponse;
import javax.xml.parsers.*;
+
+import org.apache.nutch.util.NutchConf;
import org.w3c.dom.*;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.Transformer;
@@ -55,10 +57,12 @@
}
private NutchBean bean;
+ private NutchConf nutchConf;
- public void init(ServletConfig config) throws ServletException {
+ public void init(ServletConfig config, NutchConf nutchConf) throws ServletException {
try {
- bean = NutchBean.get(config.getServletContext());
+ bean = NutchBean.get(config.getServletContext(), nutchConf);
+ this.nutchConf = nutchConf;
} catch (IOException e) {
throw new ServletException(e);
}
@@ -114,7 +118,7 @@
(sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") +
(dedupField == null ? "" : "&dedupField=" + dedupField));
- Query query = Query.parse(queryString);
+ Query query = Query.parse(queryString, this.nutchConf);
NutchBean.LOG.info("query: " + queryString);
// execute the query
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.analysis.NutchAnalysis;
import org.apache.nutch.io.Writable;
@@ -47,30 +48,34 @@
private boolean isProhibited;
private String field = DEFAULT_FIELD;
private float weight = 1.0f;
- private Object termOrPhrase;
+ private Object termOrPhrase;
+
+ private NutchConf nutchConf;
public Clause(Term term, String field,
- boolean isRequired, boolean isProhibited) {
- this(term, isRequired, isProhibited);
+ boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
+ this(term, isRequired, isProhibited, nutchConf);
this.field = field;
}
- public Clause(Term term, boolean isRequired, boolean isProhibited) {
+ public Clause(Term term, boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
this.isRequired = isRequired;
this.isProhibited = isProhibited;
this.termOrPhrase = term;
+ this.nutchConf = nutchConf;
}
public Clause(Phrase phrase, String field,
- boolean isRequired, boolean isProhibited) {
- this(phrase, isRequired, isProhibited);
+ boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
+ this(phrase, isRequired, isProhibited, nutchConf);
this.field = field;
}
- public Clause(Phrase phrase, boolean isRequired, boolean isProhibited) {
+ public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
this.isRequired = isRequired;
this.isProhibited = isProhibited;
this.termOrPhrase = phrase;
+ this.nutchConf = nutchConf;
}
public boolean isRequired() { return isRequired; }
@@ -104,7 +109,7 @@
getTerm().write(out);
}
- public static Clause read(DataInput in) throws IOException {
+ public static Clause read(DataInput in, NutchConf nutchConf) throws IOException {
byte bits = in.readByte();
boolean required = ((bits & REQUIRED_BIT) != 0);
boolean prohibited = ((bits & PROHIBITED_BIT) != 0);
@@ -114,9 +119,9 @@
Clause clause;
if ((bits & PHRASE_BIT) == 0) {
- clause = new Clause(Term.read(in), field, required, prohibited);
+ clause = new Clause(Term.read(in), field, required, prohibited, nutchConf);
} else {
- clause = new Clause(Phrase.read(in), field, required, prohibited);
+ clause = new Clause(Phrase.read(in), field, required, prohibited, nutchConf);
}
clause.weight = weight;
return clause;
@@ -135,7 +140,7 @@
buffer.append(":");
}
- if (!isPhrase() && QueryFilters.isRawField(field)) {
+ if (!isPhrase() && new QueryFilters(nutchConf).isRawField(field)) {
buffer.append('"'); // quote raw terms
buffer.append(termOrPhrase.toString());
buffer.append('"');
@@ -274,7 +279,13 @@
private ArrayList clauses = new ArrayList();
+ private NutchConf nutchConf;
+
private static final Clause[] CLAUSES_PROTO = new Clause[0];
+
+ public Query(NutchConf nutchConf) {
+ this.nutchConf = nutchConf;
+ }
/** Return all clauses. */
public Clause[] getClauses() {
@@ -288,7 +299,7 @@
/** Add a required term in a specified field. */
public void addRequiredTerm(String term, String field) {
- clauses.add(new Clause(new Term(term), field, true, false));
+ clauses.add(new Clause(new Term(term), field, true, false, this.nutchConf));
}
/** Add a prohibited term in the default field. */
@@ -298,7 +309,7 @@
/** Add a prohibited term in the specified field. */
public void addProhibitedTerm(String term, String field) {
- clauses.add(new Clause(new Term(term), field, false, true));
+ clauses.add(new Clause(new Term(term), field, false, true, this.nutchConf));
}
/** Add a required phrase in the default field. */
@@ -312,7 +323,7 @@
} else if (terms.length == 1) {
addRequiredTerm(terms[0], field); // optimize to term query
} else {
- clauses.add(new Clause(new Phrase(terms), field, true, false));
+ clauses.add(new Clause(new Phrase(terms), field, true, false, this.nutchConf));
}
}
@@ -327,7 +338,7 @@
} else if (terms.length == 1) {
addProhibitedTerm(terms[0], field); // optimize to term query
} else {
- clauses.add(new Clause(new Phrase(terms), field, false, true));
+ clauses.add(new Clause(new Phrase(terms), field, false, true, this.nutchConf));
}
}
@@ -337,8 +348,8 @@
((Clause)clauses.get(i)).write(out);
}
- public static Query read(DataInput in) throws IOException {
- Query result = new Query();
+ public static Query read(DataInput in, NutchConf nutchConf) throws IOException {
+ Query result = new Query(nutchConf);
result.readFields(in);
return result;
}
@@ -347,7 +358,7 @@
clauses.clear();
int length = in.readByte();
for (int i = 0; i < length; i++)
- clauses.add(Clause.read(in));
+ clauses.add(Clause.read(in, this.nutchConf));
}
public String toString() {
@@ -404,18 +415,18 @@
/** Parse a query from a string. */
- public static Query parse(String queryString) throws IOException {
- return fixup(NutchAnalysis.parseQuery(queryString));
+ public static Query parse(String queryString, NutchConf nutchConf) throws IOException {
+ return fixup(NutchAnalysis.parseQuery(queryString, nutchConf), nutchConf);
}
/** Convert clauses in unknown fields to the default field. */
- private static Query fixup(Query input) {
+ private static Query fixup(Query input, NutchConf nutchConf) {
// walk the query
- Query output = new Query();
+ Query output = new Query(nutchConf);
Clause[] clauses = input.getClauses();
for (int i = 0; i < clauses.length; i++) {
Clause c = clauses[i];
- if (!QueryFilters.isField(c.getField())) { // unknown field
+ if (!new QueryFilters(nutchConf).isField(c.getField())) { // unknown field
ArrayList terms = new ArrayList(); // add name to query
if (c.isPhrase()) {
terms.addAll(Arrays.asList(c.getPhrase().getTerms()));
@@ -436,12 +447,13 @@
/** For debugging. */
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ NutchConf nutchConf = new NutchConf();
while (true) {
System.out.print("Query: ");
String line = in.readLine();
- Query query = parse(line);
+ Query query = parse(line, nutchConf);
System.out.println("Parsed: " + query);
- System.out.println("Translated: " + QueryFilters.filter(query));
+ System.out.println("Translated: " + new QueryFilters(nutchConf).filter(query));
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java Tue Jan 31 08:08:58 2006
@@ -17,12 +17,13 @@
package org.apache.nutch.searcher;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.nutch.util.NutchConfigurable;
/** Extension point for query translation. Permits one to add metadata to a
* query. All plugins found which implement this extension point are run
* sequentially on the query.
*/
-public interface QueryFilter {
+public interface QueryFilter extends NutchConfigurable {
/** The name of the extension point. */
final static String X_POINT_ID = QueryFilter.class.getName();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java Tue Jan 31 08:08:58 2006
@@ -19,6 +19,8 @@
import org.apache.nutch.plugin.*;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
import java.util.logging.Logger;
import java.util.*;
@@ -35,35 +37,9 @@
private static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.searcher.QueryFilters");
- private static final QueryFilter[] CACHE;
- private static final HashSet FIELD_NAMES = new HashSet();
- private static final HashSet RAW_FIELD_NAMES = new HashSet();
-
- static {
- try {
- ExtensionPoint point = PluginRepository.getInstance()
- .getExtensionPoint(QueryFilter.X_POINT_ID);
- if (point == null)
- throw new RuntimeException(QueryFilter.X_POINT_ID+" not found.");
- Extension[] extensions = point.getExtensions();
- CACHE = new QueryFilter[extensions.length];
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- ArrayList fieldNames = parseFieldNames(extension, "fields");
- ArrayList rawFieldNames = parseFieldNames(extension, "raw-fields");
- if (fieldNames.size() == 0 && rawFieldNames.size() == 0) {
- LOG.warning("QueryFilter: "+extension.getId()+" names no fields.");
- continue;
- }
- CACHE[i] = (QueryFilter)extension.getExtensionInstance();
- FIELD_NAMES.addAll(fieldNames);
- FIELD_NAMES.addAll(rawFieldNames);
- RAW_FIELD_NAMES.addAll(rawFieldNames);
- }
- } catch (PluginRuntimeException e) {
- throw new RuntimeException(e);
- }
- }
+ private QueryFilter[] queryFilters;
+ private HashSet FIELD_NAMES ;
+ private HashSet RAW_FIELD_NAMES;
private static ArrayList parseFieldNames(Extension extension,
String attribute) {
@@ -72,10 +48,50 @@
return Collections.list(new StringTokenizer(fields, " ,\t\n\r"));
}
- private QueryFilters() {} // no public ctor
+ /**
+ * Load the QueryFilter plugins. The filters and the field names they claim are
+ * stored in the given configuration, so later instances built from the same
+ * configuration reuse them instead of querying the plugin repository again.
+ *
+ * @param nutchConf configuration providing the plugin repository and object cache
+ * @throws RuntimeException if the extension point is missing or a plugin fails to load
+ */
+ public QueryFilters(NutchConf nutchConf) {
+ this.queryFilters = (QueryFilter[]) nutchConf.getObject(QueryFilter.class
+ .getName());
+ if (this.queryFilters == null) {
+ // Allocate once, before the loop: the sets must accumulate the fields of
+ // every extension, and must be non-null even when no extension qualifies.
+ FIELD_NAMES = new HashSet();
+ RAW_FIELD_NAMES = new HashSet();
+ try {
+ ExtensionPoint point = nutchConf.getPluginRepository()
+ .getExtensionPoint(QueryFilter.X_POINT_ID);
+ if (point == null)
+ throw new RuntimeException(QueryFilter.X_POINT_ID + " not found.");
+ Extension[] extensions = point.getExtensions();
+ // Collect into a list so skipped extensions leave no null slot for
+ // filter() to dereference.
+ ArrayList filters = new ArrayList(extensions.length);
+ for (int i = 0; i < extensions.length; i++) {
+ Extension extension = extensions[i];
+ ArrayList fieldNames = parseFieldNames(extension, "fields");
+ ArrayList rawFieldNames = parseFieldNames(extension, "raw-fields");
+ if (fieldNames.size() == 0 && rawFieldNames.size() == 0) {
+ LOG.warning("QueryFilter: " + extension.getId()
+ + " names no fields.");
+ continue;
+ }
+ filters.add((QueryFilter) extension.getExtensionInstance());
+ FIELD_NAMES.addAll(fieldNames);
+ FIELD_NAMES.addAll(rawFieldNames);
+ RAW_FIELD_NAMES.addAll(rawFieldNames);
+ }
+ this.queryFilters = (QueryFilter[]) filters
+ .toArray(new QueryFilter[filters.size()]);
+ } catch (PluginRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+ // Publish the field names before the filters: the filter array is the
+ // "cache filled" marker checked at the top of this constructor.
+ nutchConf.setObject("FIELD_NAMES", FIELD_NAMES);
+ nutchConf.setObject("RAW_FIELD_NAMES", RAW_FIELD_NAMES);
+ nutchConf.setObject(QueryFilter.class.getName(), this.queryFilters);
+ } else {
+ // cache already filled
+ FIELD_NAMES = (HashSet) nutchConf.getObject("FIELD_NAMES");
+ RAW_FIELD_NAMES = (HashSet) nutchConf.getObject("RAW_FIELD_NAMES");
+ }
+ }
/** Run all defined filters. */
- public static BooleanQuery filter(Query input) throws QueryException {
+ public BooleanQuery filter(Query input) throws QueryException {
// first check that all field names are claimed by some plugin
Clause[] clauses = input.getClauses();
for (int i = 0; i < clauses.length; i++) {
@@ -86,16 +102,17 @@
// then run each plugin
BooleanQuery output = new BooleanQuery();
- for (int i = 0 ; i < CACHE.length; i++) {
- output = CACHE[i].filter(input, output);
+ for (int i = 0; i < this.queryFilters.length; i++) {
+ output = this.queryFilters[i].filter(input, output);
}
return output;
}
- public static boolean isField(String name) {
+ public boolean isField(String name) {
return FIELD_NAMES.contains(name);
}
- public static boolean isRawField(String name) {
+
+ public boolean isRawField(String name) {
return RAW_FIELD_NAMES.contains(name);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java Tue Jan 31 08:08:58 2006
@@ -29,21 +29,23 @@
/** Implements hit summarization. */
public class Summarizer {
-
- /** The number of context terms to display preceding and following matches.*/
- private static final int SUM_CONTEXT =
- NutchConf.get().getInt("searcher.summary.context", 5);
-
- /** The total number of terms to display in a summary.*/
- private static final int SUM_LENGTH =
- NutchConf.get().getInt("searcher.summary.length", 20);
-
+
/** Converts text to tokens. */
- private static final Analyzer ANALYZER = new NutchDocumentAnalyzer();
+ private Analyzer ANALYZER;
+ private NutchConf nutchConf;
/**
- * Class Excerpt represents a single passage found in the
- * document, with some appropriate regions highlit.
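+ * Construct a summarizer whose document analyzer is built from the given configuration.
+ * @param conf the configuration kept by this summarizer and passed to NutchDocumentAnalyzer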
+ */
+ public Summarizer(NutchConf conf) {
+ this.nutchConf = conf;
+ this.ANALYZER = new NutchDocumentAnalyzer(conf);
+ }
+
+ /**
+ * Class Excerpt represents a single passage found in the document, with some
+ * appropriate regions highlit.
*/
class Excerpt {
Vector passages = new Vector();
@@ -54,7 +56,7 @@
*/
public Excerpt() {
}
-
+
/**
*/
public void addToken(String token) {
@@ -99,7 +101,7 @@
}
/** Returns a summary for the given pre-tokenized text. */
- public Summary getSummary(String text, Query query) throws IOException {
+ public Summary getSummary(String text, Query query, int sumContext, int sumLength) throws IOException {
// Simplistic implementation. Finds the first fragments in the document
// containing any query terms.
@@ -161,8 +163,8 @@
// Start searching at a point SUM_CONTEXT terms back,
// and move SUM_CONTEXT terms into the future.
//
- int startToken = (i > SUM_CONTEXT) ? i-SUM_CONTEXT : 0;
- int endToken = Math.min(i+SUM_CONTEXT, tokens.length);
+ int startToken = (i > sumContext) ? i - sumContext : 0;
+ int endToken = Math.min(i + sumContext, tokens.length);
int offset = tokens[startToken].startOffset();
int j = startToken;
@@ -181,7 +183,7 @@
// the document and we haven't hit the max-number-of-items
// -in-a-summary.
//
- while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
+ while ((j < endToken) && (j - startToken < sumLength)) {
//
// Now grab the hit-element, if present
//
@@ -191,7 +193,7 @@
excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
excerpt.add(new Highlight(text.substring(t.startOffset(),t.endOffset())));
offset = t.endOffset();
- endToken = Math.min(j+SUM_CONTEXT, tokens.length);
+ endToken = Math.min(j + sumContext, tokens.length);
}
j++;
@@ -226,7 +228,7 @@
// Start SUM_CONTEXT places away. The next
// search for relevant excerpts begins at i-SUM_CONTEXT
//
- i = j+SUM_CONTEXT;
+ i = j + sumContext;
}
}
@@ -236,7 +238,7 @@
//
if (excerptSet.size() == 0) {
Excerpt excerpt = new Excerpt();
- int excerptLen = Math.min(SUM_LENGTH, tokens.length);
+ int excerptLen = Math.min(sumLength, tokens.length);
lastExcerptPos = excerptLen;
excerpt.add(new Fragment(text.substring(tokens[0].startOffset(), tokens[excerptLen-1].startOffset())));
@@ -250,7 +252,7 @@
//
double tokenCount = 0;
Summary s = new Summary();
- while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
+ while (tokenCount <= sumLength && excerptSet.size() > 0) {
Excerpt excerpt = (Excerpt) excerptSet.last();
excerptSet.remove(excerpt);
@@ -258,7 +260,7 @@
for (Enumeration e = excerpt.elements(); e.hasMoreElements(); ) {
Fragment f = (Fragment) e.nextElement();
// Don't add fragments if it takes us over the max-limit
- if (tokenCount + tokenFraction <= SUM_LENGTH) {
+ if (tokenCount + tokenFraction <= sumLength) {
s.add(f);
}
tokenCount += tokenFraction;
@@ -290,7 +292,7 @@
return;
}
- Summarizer s = new Summarizer();
+ Summarizer s = new Summarizer(new NutchConf());
//
// Parse the args
@@ -318,8 +320,11 @@
in.close();
}
+ NutchConf nutchConf = new NutchConf();
+ int sumContext = nutchConf.getInt("searcher.summary.context", 5);
+ int sumLength = nutchConf.getInt("searcher.summary.length", 20);
// Convert the query string into a proper Query
- Query query = Query.parse(queryBuf.toString());
- System.out.println("Summary: '" + s.getSummary(body.toString(), query) + "'");
+ Query query = Query.parse(queryBuf.toString(), nutchConf);
+ System.out.println("Summary: '" + s.getSummary(body.toString(), query, sumContext, sumLength) + "'");
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Tue Jan 31 08:08:58 2006
@@ -68,7 +68,7 @@
throws IOException {
reporter.setStatus(split.toString());
- return new SequenceFileRecordReader(fs, split) {
+ return new SequenceFileRecordReader(job, split) {
public synchronized boolean next(Writable key, Writable value)
throws IOException {
ObjectWritable wrapper = (ObjectWritable)value;
@@ -219,7 +219,8 @@
}
public static void main(String[] args) throws Exception {
- SegmentReader segmentReader = new SegmentReader(NutchConf.get());
+ NutchConf nutchConf = new NutchConf();
+ SegmentReader segmentReader = new SegmentReader(nutchConf);
String usage = "Usage: SegmentReader <segment>";
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Tue Jan 31 08:08:58 2006
@@ -47,9 +47,9 @@
NutchBean bean = null;
- public void init() {
+ public void init(NutchConf nutchConf) {
try {
- bean = NutchBean.get(this.getServletContext());
+ bean = NutchBean.get(this.getServletContext(), nutchConf);
} catch (IOException e) {
// nothing
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Tue Jan 31 08:08:58 2006
@@ -338,7 +338,8 @@
Pattern topicPattern = null;
Vector topics = new Vector();
- NutchFileSystem nfs = NutchFileSystem.get();
+ NutchConf nutchConf = new NutchConf();
+ NutchFileSystem nfs = NutchFileSystem.get(nutchConf);
try {
for (int i = 1; i < argv.length; i++) {
if ("-includeAdultMaterial".equals(argv[i])) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java Tue Jan 31 08:08:58 2006
@@ -459,8 +459,9 @@
if (qPath != null) {
is = new FileInputStream(qPath);
} else {
- qPath = NutchConf.get().get("prune.index.tool.queries");
- is = NutchConf.get().getConfResourceAsInputStream(qPath);
+ NutchConf nutchConf = new NutchConf();
+ qPath = nutchConf.get("prune.index.tool.queries");
+ is = nutchConf.getConfResourceAsInputStream(qPath);
}
if (is == null) {
LOG.severe("Can't load queries from " + qPath);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java Tue Jan 31 08:08:58 2006
@@ -18,11 +18,12 @@
import java.util.*;
import java.net.URL;
-import java.net.URLClassLoader;
-import java.net.MalformedURLException;
import java.io.*;
import java.util.logging.Logger;
+
import javax.xml.parsers.*;
+
+import org.apache.nutch.plugin.PluginRepository;
import org.w3c.dom.*;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.Transformer;
@@ -45,16 +46,13 @@
private static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.util.NutchConf");
- private static final NutchConf DEFAULT = new NutchConf();
-
- /** Return the default configuration. */
- public static NutchConf get() { return DEFAULT; }
-
private ArrayList resourceNames = new ArrayList();
private Properties properties;
private ClassLoader classLoader =
Thread.currentThread().getContextClassLoader();
+ private PluginRepository pluginRepository;
+
/** A new configuration. */
public NutchConf() {
resourceNames.add("nutch-default.xml");
@@ -89,9 +87,21 @@
resourceNames.add(resourceNames.size()-1, name); // add second to last
properties = null; // trigger reload
}
+
+ /**
+ * @return a cached instance of the plugin repository
+ */
+ public PluginRepository getPluginRepository() {
+ if (this.pluginRepository == null) {
+ this.pluginRepository = new PluginRepository(this);
+ }
+ return this.pluginRepository;
+ }
- /** Returns the value of the <code>name</code> property, or null if no
- * such property exists. */
+ /**
+ * Returns the value of the <code>name</code> property, or null if no such
+ * property exists.
+ */
public Object getObject(String name) { return getProps().get(name);}
/** Sets the value of the <code>name</code> property. */
@@ -390,8 +400,13 @@
conf.appendChild(doc.createTextNode("\n"));
for (Enumeration e = properties.keys(); e.hasMoreElements();) {
String name = (String)e.nextElement();
- String value = (String)properties.get(name);
-
+ Object object = properties.get(name);
+ String value = null;
+ if(object instanceof String) {
+ value = (String) object;
+ }else {
+ continue;
+ }
Element propNode = doc.createElement("property");
conf.appendChild(propNode);
@@ -437,7 +452,7 @@
/** For debugging. List non-default properties to the terminal and exit. */
public static void main(String[] args) throws Exception {
- get().write(System.out);
+ new NutchConf().write(System.out);
}
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -30,6 +30,7 @@
import java.util.logging.Logger;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import java.util.*;
import java.net.URL;
@@ -43,6 +44,8 @@
/** The name of the document field we use. */
public static String FIELD = "cc";
+ private NutchConf nutchConf;
+
public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -98,6 +101,14 @@
private void addFeature(Document doc, String feature) {
doc.add(Field.Keyword(FIELD, feature));
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Tue Jan 31 08:08:58 2006
@@ -36,8 +36,6 @@
public static final Logger LOG
= LogFormatter.getLogger(CCParseFilter.class.getName());
- private static final boolean EXCLUDE_UNLICENSED =
- NutchConf.get().getBoolean("creativecommons.exclude.unlicensed", false);
/** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
public static class Walker {
@@ -52,7 +50,7 @@
}
/** Scan the document adding attributes to metadata.*/
- public static void walk(Node doc, URL base, ContentProperties metadata)
+ public static void walk(Node doc, URL base, ContentProperties metadata, NutchConf nutchConf)
throws ParseException {
// walk the DOM tree, scanning for license data
@@ -71,7 +69,7 @@
} else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
licenseLocation = "a";
licenseUrl = walker.anchorLicense.toString();
- } else if (EXCLUDE_UNLICENSED) {
+ } else if (nutchConf.getBoolean("creativecommons.exclude.unlicensed", false)) {
throw new ParseException("No CC license. Excluding.");
}
@@ -251,6 +249,8 @@
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
}
+ private NutchConf nutchConf;
+
/** Adds metadata or otherwise modifies a parse of an HTML document, given
* the DOM tree of a page. */
public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -260,17 +260,24 @@
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
try {
// extract license metadata
- Walker.walk(doc, base, parse.getData().getMetadata());
+ Walker.walk(doc, base, parse.getData().getMetadata(), getConf());
} catch (ParseException e) {
- return new ParseStatus(e).getEmptyParse();
+ return new ParseStatus(e).getEmptyParse(getConf());
}
return parse;
}
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java Tue Jan 31 08:08:58 2006
@@ -17,11 +17,24 @@
package org.creativecommons.nutch;
import org.apache.nutch.searcher.RawFieldQueryFilter;
+import org.apache.nutch.util.NutchConf;
-/** Handles "cc:" query clauses, causing them to search the "cc" field
- * indexed by CCIndexingFilter. */
+/**
+ * Handles "cc:" query clauses, causing them to search the "cc" field indexed by
+ * CCIndexingFilter.
+ */
public class CCQueryFilter extends RawFieldQueryFilter {
+ private NutchConf nutchConf;
+
public CCQueryFilter() {
super(CCIndexingFilter.FIELD);
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Jan 31 08:08:58 2006
@@ -1,68 +1,70 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ContentProperties;
-
-import java.util.Properties;
-import java.io.*;
-import java.net.URL;
-
-import junit.framework.TestCase;
-
-public class TestCCParseFilter extends TestCase {
-
- private static final File testDir =
- new File(System.getProperty("test.input"));
-
- public void testPages() throws Exception {
- pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
- pageTest(new File(testDir, "rel.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
- pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
- }
-
- public void pageTest(File file, String url,
- String license, String location, String type)
- throws Exception {
-
- String contentType = "text/html";
- InputStream in = new FileInputStream(file);
- ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
- byte[] buffer = new byte[1024];
- int i;
- while ((i = in.read(buffer)) != -1) {
- out.write(buffer, 0, i);
- }
- in.close();
- byte[] bytes = out.toByteArray();
-
- Content content =
- new Content(url, url, bytes, contentType, new ContentProperties());
- Parse parse = ParseUtil.parseByParserId("parse-html",content);
-
- ContentProperties metadata = parse.getData().getMetadata();
- assertEquals(license, metadata.get("License-Url"));
- assertEquals(location, metadata.get("License-Location"));
- assertEquals(type, metadata.get("Work-Type"));
- }
-}
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
+
+import java.util.Properties;
+import java.io.*;
+import java.net.URL;
+
+import junit.framework.TestCase;
+
+public class TestCCParseFilter extends TestCase {
+
+ private static final File testDir =
+ new File(System.getProperty("test.input"));
+
+ public void testPages() throws Exception {
+ pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+ pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+ pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+ }
+
+ public void pageTest(File file, String url,
+ String license, String location, String type)
+ throws Exception {
+
+ String contentType = "text/html";
+ InputStream in = new FileInputStream(file);
+ ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
+ byte[] buffer = new byte[1024];
+ int i;
+ while ((i = in.read(buffer)) != -1) {
+ out.write(buffer, 0, i);
+ }
+ in.close();
+ byte[] bytes = out.toByteArray();
+ NutchConf nutchConf = new NutchConf();
+
+ Content content =
+ new Content(url, url, bytes, contentType, new ContentProperties(), nutchConf);
+ Parse parse = new ParseUtil(nutchConf).parseByParserId("parse-html",content);
+
+ ContentProperties metadata = parse.getData().getMetadata();
+ assertEquals(license, metadata.get("License-Url"));
+ assertEquals(location, metadata.get("License-Location"));
+ assertEquals(type, metadata.get("Work-Type"));
+ }
+}
Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -40,8 +40,8 @@
public static final Logger LOG
= LogFormatter.getLogger(BasicIndexingFilter.class.getName());
- private static final int MAX_TITLE_LENGTH =
- NutchConf.get().getInt("indexer.max.title.length", 100);
+ private int MAX_TITLE_LENGTH;
+ private NutchConf nutchConf;
public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -87,6 +87,15 @@
doc.add(Field.Text("title", title));
return doc;
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -75,13 +75,10 @@
= LogFormatter.getLogger(MoreIndexingFilter.class.getName());
/** A flag that tells if magic resolution must be performed */
- private final static boolean MAGIC =
- NutchConf.get().getBoolean("mime.type.magic", true);
+ private boolean MAGIC;
/** Get the MimeTypes resolver instance. */
- private final static MimeTypes MIME =
- MimeTypes.get(NutchConf.get().get("mime.types.file"));
-
+ private MimeTypes MIME;
public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -247,6 +244,8 @@
// HTTP header "Content-Disposition". Typically it looks like:
// Content-Disposition: inline; filename="foo.ppt"
private PatternMatcher matcher = new Perl5Matcher();
+
+ private NutchConf nutchConf;
static Perl5Pattern patterns[] = {null, null};
static {
Perl5Compiler compiler = new Perl5Compiler();
@@ -300,6 +299,16 @@
}
return normalized;
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ MAGIC = conf.getBoolean("mime.type.magic", true);
+ MIME = MimeTypes.get(getConf().get("mime.types.file"));
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Tue Jan 31 08:08:58 2006
@@ -28,6 +28,7 @@
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
// DOM imports
import org.w3c.dom.DocumentFragment;
@@ -67,6 +68,8 @@
LOG.severe(e.toString());
}
}
+
+ private NutchConf nutchConf;
@@ -195,5 +198,11 @@
}
-
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Tue Jan 31 08:08:58 2006
@@ -95,12 +95,12 @@
/**
* Constructs a new Language Identifier.
*/
- private LanguageIdentifier() {
+ public LanguageIdentifier(NutchConf nutchConf) {
// Gets ngram sizes to take into account from the Nutch Config
- minLength = NutchConf.get().getInt("lang.ngram.min.length",
+ minLength = nutchConf.getInt("lang.ngram.min.length",
NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
- maxLength = NutchConf.get().getInt("lang.ngram.max.length",
+ maxLength = nutchConf.getInt("lang.ngram.max.length",
NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
// Ensure the min and max values are in an acceptale range
// (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
@@ -110,7 +110,7 @@
minLength = Math.min(minLength, maxLength);
// Gets the value of the maximum size of data to analyze
- analyzeLength = NutchConf.get().getInt("lang.analyze.max.length",
+ analyzeLength = nutchConf.getInt("lang.analyze.max.length",
DEFAULT_ANALYSIS_LENGTH);
Properties p = new Properties();
@@ -174,20 +174,6 @@
}
}
- /**
- * Get a LanguageIdentifier instance.
- * @return the LanguageIdentifier singleton instance.
- */
- public static LanguageIdentifier getInstance() {
- if (identifier == null) {
- synchronized(LanguageIdentifier.class) {
- if (identifier == null) {
- identifier = new LanguageIdentifier();
- }
- }
- }
- return identifier;
- }
/**
* Main method used for command line process.
@@ -272,9 +258,10 @@
}
+ NutchConf nutchConf = new NutchConf();
String lang = null;
//LanguageIdentifier idfr = LanguageIdentifier.getInstance();
- LanguageIdentifier idfr = new LanguageIdentifier();
+ LanguageIdentifier idfr = new LanguageIdentifier(nutchConf);
File f;
FileInputStream fis;
try {
@@ -292,7 +279,7 @@
break;
case IDURL:
- text = getUrlContent(filename);
+ text = getUrlContent(filename, nutchConf);
lang = idfr.identify(text);
break;
@@ -348,13 +335,13 @@
* @param url
* @return contents of url
*/
- private static String getUrlContent(String url) {
+ private static String getUrlContent(String url, NutchConf nutchConf) {
Protocol protocol;
try {
- protocol = ProtocolFactory.getProtocol(url);
+ protocol = new ProtocolFactory(nutchConf).getProtocol(url);
Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
String contentType = content.getContentType();
- Parser parser = ParserFactory.getParser(contentType, url);
+ Parser parser = new ParserFactory(nutchConf).getParser(contentType, url);
Parse parse = parser.getParse(content);
System.out.println("text:" + parse.getText());
return parse.getText();
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConf;
// Lucene imports
import org.apache.lucene.document.Field;
@@ -49,7 +50,10 @@
public class LanguageIndexingFilter implements IndexingFilter {
- /**
+ private NutchConf nutchConf;
+ private LanguageIdentifier languageIdentifier;
+
+/**
* Constructs a new Language Indexing Filter.
*/
public LanguageIndexingFilter() {
@@ -77,7 +81,7 @@
text.append(parse.getData().getTitle())
.append(" ")
.append(parse.getText());
- lang = LanguageIdentifier.getInstance().identify(text);
+ lang = this.languageIdentifier.identify(text);
}
if (lang == null) {
@@ -88,5 +92,13 @@
return doc;
}
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ this.languageIdentifier = new LanguageIdentifier(conf);
+ }
+ public NutchConf getConf() {
+ return this.nutchConf;
+ }
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java Tue Jan 31 08:08:58 2006
@@ -17,11 +17,22 @@
package org.apache.nutch.analysis.lang;
import org.apache.nutch.searcher.RawFieldQueryFilter;
+import org.apache.nutch.util.NutchConf;
/** Handles "lang:" query clauses, causing them to search the "lang" field
* indexed by LanguageIdentifier. */
public class LanguageQueryFilter extends RawFieldQueryFilter {
+ private NutchConf nutchConf;
+
public LanguageQueryFilter() {
super("lang");
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
+import org.apache.nutch.util.NutchConf;
public class TestHTMLLanguageParser extends TestCase {
@@ -52,7 +53,7 @@
for (int t = 0; t < docs.length; t++) {
Content content = getContent(docs[t]);
- Parser parser = ParserFactory.getParser("text/html", URL);
+ Parser parser = new ParserFactory(new NutchConf()).getParser("text/html", URL);
Parse parse = parser.getParse(content);
assertEquals(metalanguages[t], (String) parse.getData().get(
@@ -125,7 +126,7 @@
ContentProperties p = new ContentProperties();
p.put("Content-Type", "text/html");
- Content content = new Content(URL, BASE, text.getBytes(), "text/html", p);
+ Content content = new Content(URL, BASE, text.getBytes(), "text/html", p, new NutchConf());
return content;
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Tue Jan 31 08:08:58 2006
@@ -32,6 +32,7 @@
// Lucene imports
import org.apache.lucene.analysis.Token;
+import org.apache.nutch.util.NutchConf;
/**
@@ -204,7 +205,7 @@
public void testIdentify() {
try {
long total = 0;
- LanguageIdentifier idfr = LanguageIdentifier.getInstance();
+ LanguageIdentifier idfr = new LanguageIdentifier(new NutchConf());
BufferedReader in = new BufferedReader(new InputStreamReader(
this.getClass().getResourceAsStream("test-referencial.txt")));
String line = null;
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Jan 31 08:08:58 2006
@@ -24,6 +24,7 @@
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
+
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.io.UTF8;
import org.apache.nutch.net.protocols.Response;
@@ -43,35 +44,46 @@
public static final int BUFFER_SIZE = 8 * 1024;
- public static String PROXY_HOST =
- NutchConf.get().get("http.proxy.host");
-
- public static int PROXY_PORT =
- NutchConf.get().getInt("http.proxy.port", 8080);
-
- public static boolean PROXY =
- (PROXY_HOST != null && PROXY_HOST.length() > 0);
-
- public static int TIMEOUT =
- NutchConf.get().getInt("http.timeout", 10000);
-
- public static int MAX_CONTENT =
- NutchConf.get().getInt("http.content.limit", 64 * 1024);
-
- public static int MAX_DELAYS =
- NutchConf.get().getInt("http.max.delays", 3);
-
- public static int MAX_THREADS_PER_HOST =
- NutchConf.get().getInt("fetcher.threads.per.host", 1);
-
- public static String AGENT_STRING =
- getAgentString();
-
- public static long SERVER_DELAY =
- (long) (NutchConf.get().getFloat("fetcher.server.delay", 1.0f) * 1000);
-
-
private static final byte[] EMPTY_CONTENT = new byte[0];
+
+ private RobotRulesParser robots = null;
+
+ /** The proxy hostname. */
+ protected String proxyHost = null;
+
+ /** The proxy port. */
+ protected int proxyPort = 8080;
+
+ /** Indicates if a proxy is used */
+ protected boolean useProxy = false;
+
+ /** The network timeout in milliseconds */
+ protected int timeout = 10000;
+
+ /** The length limit for downloaded content, in bytes. */
+ protected int maxContent = 64 * 1024;
+
+ /** The number of times a thread will delay when trying to fetch a page. */
+ protected int maxDelays = 3;
+
+ /**
+ * The maximum number of threads that should be allowed
+ * to access a host at one time.
+ */
+ protected int maxThreadsPerHost = 1;
+
+ /**
+ * The number of milliseconds the fetcher will delay between
+ * successive requests to the same server.
+ */
+ protected long serverDelay = 1000;
+
+ /** The Nutch 'User-Agent' request header */
+ protected String userAgent = getAgentString(
+ "NutchCVS", null, "Nutch",
+ "http://lucene.apache.org/nutch/bot.html",
+ "nutch-agent@lucene.apache.org");
+
/**
* Maps from InetAddress to a Long naming the time it should be unblocked.
@@ -97,7 +109,10 @@
/** The specified logger */
private Logger logger = LOGGER;
-
+
+ /** The nutch configuration */
+ private NutchConf conf = null;
+
/** Creates a new instance of HttpBase */
public HttpBase() {
@@ -109,14 +124,32 @@
if (logger != null) {
this.logger = logger;
}
- logger.fine("http.proxy.host = " + PROXY_HOST);
- logger.fine("http.proxy.port = " + PROXY_PORT);
- logger.fine("http.timeout = " + TIMEOUT);
- logger.fine("http.content.limit = " + MAX_CONTENT);
- logger.fine("http.agent = " + AGENT_STRING);
- logger.fine("fetcher.server.delay = " + SERVER_DELAY);
- logger.fine("http.max.delays = " + MAX_DELAYS);
+ robots = new RobotRulesParser();
+ }
+
+ // Inherited Javadoc
+ public void setConf(NutchConf conf) {
+ this.conf = conf;
+ this.proxyHost = conf.get("http.proxy.host");
+ this.proxyPort = conf.getInt("http.proxy.port", 8080);
+ this.useProxy = (proxyHost != null && proxyHost.length() > 0);
+ this.timeout = conf.getInt("http.timeout", 10000);
+ this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+ this.maxDelays = conf.getInt("http.max.delays", 3);
+ this.maxThreadsPerHost = conf.getInt("fetcher.threads.per.host", 1);
+ this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
+ .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
+ this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
+ this.robots.setConf(conf);
+ logConf();
+ }
+
+ // Inherited Javadoc
+ public NutchConf getConf() {
+ return this.conf;
}
+
+
public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
@@ -125,7 +158,7 @@
URL u = new URL(urlString);
try {
- if (!RobotRulesParser.isAllowed(this, u)) {
+ if (!robots.isAllowed(this, u)) {
return new ProtocolOutput(null, new ProtocolStatus(ProtocolStatus.ROBOTS_DENIED, url));
}
} catch (Throwable e) {
@@ -146,7 +179,7 @@
Content c = new Content(u.toString(), u.toString(),
(content == null ? EMPTY_CONTENT : content),
response.getHeader("Content-Type"),
- response.getHeaders());
+ response.getHeaders(), this.conf);
if (code == 200) { // got a good response
return new ProtocolOutput(c); // return it
@@ -203,8 +236,49 @@
}
}
-
- private static InetAddress blockAddr(URL url) throws ProtocolException {
+ /* -------------------------- *
+ * </implementation:Protocol> *
+ * -------------------------- */
+
+
+ public String getProxyHost() {
+ return proxyHost;
+ }
+
+ public int getProxyPort() {
+ return proxyPort;
+ }
+
+ public boolean useProxy() {
+ return useProxy;
+ }
+
+ public int getTimeout() {
+ return timeout;
+ }
+
+ public int getMaxContent() {
+ return maxContent;
+ }
+
+ public int getMaxDelays() {
+ return maxDelays;
+ }
+
+ public int getMaxThreadsPerHost() {
+ return maxThreadsPerHost;
+ }
+
+ public long getServerDelay() {
+ return serverDelay;
+ }
+
+ public String getUserAgent() {
+ return userAgent;
+ }
+
+
+ private InetAddress blockAddr(URL url) throws ProtocolException {
InetAddress addr;
try {
@@ -229,21 +303,21 @@
count++; // increment & store
THREADS_PER_HOST_COUNT.put(addr, new Integer(count));
- if (count >= MAX_THREADS_PER_HOST) {
+ if (count >= maxThreadsPerHost) {
BLOCKED_ADDR_TO_TIME.put(addr, new Long(0)); // block it
}
return addr;
}
}
- if (delays == MAX_DELAYS)
+ if (delays == maxDelays)
throw new HttpException("Exceeded http.max.delays: retry later.");
long done = time.longValue();
long now = System.currentTimeMillis();
long sleep = 0;
if (done == 0) { // address is still in use
- sleep = SERVER_DELAY; // wait at least delay
+ sleep = serverDelay; // wait at least delay
} else if (now < done) { // address is on hold
sleep = done - now; // wait until its free
@@ -256,14 +330,14 @@
}
}
- private static void unblockAddr(InetAddress addr) {
+ private void unblockAddr(InetAddress addr) {
synchronized (BLOCKED_ADDR_TO_TIME) {
int addrCount = ((Integer)THREADS_PER_HOST_COUNT.get(addr)).intValue();
if (addrCount == 1) {
THREADS_PER_HOST_COUNT.remove(addr);
BLOCKED_ADDR_QUEUE.addFirst(addr);
BLOCKED_ADDR_TO_TIME.put
- (addr, new Long(System.currentTimeMillis()+SERVER_DELAY));
+ (addr, new Long(System.currentTimeMillis() + serverDelay));
} else {
THREADS_PER_HOST_COUNT.put(addr, new Integer(addrCount - 1));
}
@@ -285,13 +359,11 @@
}
}
- private static String getAgentString() {
-
- String agentName = NutchConf.get().get("http.agent.name");
- String agentVersion = NutchConf.get().get("http.agent.version");
- String agentDesc = NutchConf.get().get("http.agent.description");
- String agentURL = NutchConf.get().get("http.agent.url");
- String agentEmail = NutchConf.get().get("http.agent.email");
+ private static String getAgentString(String agentName,
+ String agentVersion,
+ String agentDesc,
+ String agentURL,
+ String agentEmail) {
if ( (agentName == null) || (agentName.trim().length() == 0) )
LOGGER.severe("No User-Agent string set (http.agent.name)!");
@@ -327,6 +399,16 @@
}
return buf.toString();
}
+
+ protected void logConf() {
+ logger.info("http.proxy.host = " + proxyHost);
+ logger.info("http.proxy.port = " + proxyPort);
+ logger.info("http.timeout = " + timeout);
+ logger.info("http.content.limit = " + maxContent);
+ logger.info("http.agent = " + userAgent);
+ logger.info("fetcher.server.delay = " + serverDelay);
+ logger.info("http.max.delays = " + maxDelays);
+ }
protected static void main(HttpBase http, String[] args) throws Exception {
boolean verbose = false;
@@ -341,7 +423,7 @@
for (int i = 0; i < args.length; i++) { // parse command line
if (args[i].equals("-timeout")) { // found -timeout option
- TIMEOUT = Integer.parseInt(args[++i]) * 1000;
+ http.timeout = Integer.parseInt(args[++i]) * 1000;
} else if (args[i].equals("-verbose")) { // found -verbose option
verbose = true;
} else if (i != args.length - 1) {
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Jan 31 08:08:58 2006
@@ -30,6 +30,7 @@
// Nutch imports
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
@@ -45,14 +46,13 @@
* @author Mike Cafarella
* @author Doug Cutting
*/
-public class RobotRulesParser {
+public class RobotRulesParser implements NutchConfigurable {
+
public static final Logger LOG=
- LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
+ LogFormatter.getLogger(RobotRulesParser.class.getName());
- private static final boolean ALLOW_FORBIDDEN =
- NutchConf.get().getBoolean("http.robots.403.allow", false);
+ private boolean allowForbidden = false;
- private static final String[] AGENTS = getAgents();
private static final Hashtable CACHE = new Hashtable();
private static final String CHARACTER_ENCODING= "UTF-8";
@@ -60,9 +60,9 @@
private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
- private static RobotRuleSet FORBID_ALL_RULES =
- new RobotRulesParser().getForbidAllRules();
+ private static RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();
+ private NutchConf conf;
private HashMap robotNames;
/**
@@ -87,14 +87,6 @@
}
/**
- * should not be instantiated from outside RobotRulesParser
- */
- private RobotRuleSet() {
- tmpEntries= new ArrayList();
- entries= null;
- }
-
- /**
*/
private void addPrefix(String prefix, boolean allow) {
if (tmpEntries == null) {
@@ -182,14 +174,25 @@
}
- public RobotRulesParser() { this(AGENTS); }
+ RobotRulesParser() { }
+
+ public RobotRulesParser(NutchConf conf) {
+ setConf(conf);
+ }
- private static String[] getAgents() {
+
+ /* ---------------------------------- *
+ * <implementation:NutchConfigurable> *
+ * ---------------------------------- */
+
+ public void setConf(NutchConf conf) {
+ this.conf = conf;
+ allowForbidden = conf.getBoolean("http.robots.403.allow", false);
//
// Grab the agent names we advertise to robots files.
//
- String agentName = NutchConf.get().get("http.agent.name");
- String agentNames = NutchConf.get().get("http.robots.agents");
+ String agentName = conf.get("http.agent.name");
+ String agentNames = conf.get("http.robots.agents");
StringTokenizer tok = new StringTokenizer(agentNames, ",");
ArrayList agents = new ArrayList();
while (tok.hasMoreTokens()) {
@@ -197,22 +200,38 @@
}
//
- // If there are no agents for robots-parsing, use our
+ // If there are no agents for robots-parsing, use our
// default agent-string. If both are present, our agent-string
// should be the first one we advertise to robots-parsing.
- //
+ //
if (agents.size() == 0) {
agents.add(agentName);
LOG.severe("No agents listed in 'http.robots.agents' property!");
} else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
agents.add(0, agentName);
- LOG.severe("Agent we advertise (" + agentName
+ LOG.severe("Agent we advertise (" + agentName
+ ") not listed first in 'http.robots.agents' property!");
}
+ setRobotNames((String[]) agents.toArray(new String[agents.size()]));
+ }
- return (String[])agents.toArray(new String[agents.size()]);
+ public NutchConf getConf() {
+ return conf;
}
+ /* ----------------------------------- *
+ * </implementation:NutchConfigurable> *
+ * ----------------------------------- */
+
+ private void setRobotNames(String[] robotNames) {
+ this.robotNames= new HashMap();
+ for (int i= 0; i < robotNames.length; i++) {
+ this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
+ }
+ // always make sure "*" is included
+ if (!this.robotNames.containsKey("*"))
+ this.robotNames.put("*", new Integer(robotNames.length));
+ }
/**
* Creates a new <code>RobotRulesParser</code> which will use the
@@ -223,14 +242,8 @@
* rules associated with the robot name having the smallest index
* will be used.
*/
- public RobotRulesParser(String[] robotNames) {
- this.robotNames= new HashMap();
- for (int i= 0; i < robotNames.length; i++) {
- this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
- }
- // always make sure "*" is included
- if (!this.robotNames.containsKey("*"))
- this.robotNames.put("*", new Integer(robotNames.length));
+ RobotRulesParser(String[] robotNames) {
+ setRobotNames(robotNames);
}
/**
@@ -368,7 +381,7 @@
return rules;
}
- public static boolean isAllowed(HttpBase http, URL url)
+ public boolean isAllowed(HttpBase http, URL url)
throws ProtocolException, IOException {
String host = url.getHost();
@@ -382,8 +395,8 @@
new CrawlDatum(), true);
if (response.getCode() == 200) // found rules: parse them
- robotRules = new RobotRulesParser().parseRules(response.getContent());
- else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
+ robotRules = parseRules(response.getContent());
+ else if ( (response.getCode() == 403) && (!allowForbidden) )
robotRules = FORBID_ALL_RULES; // use forbid all
else
robotRules = EMPTY_RULES; // use default rules
Modified: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java Tue Jan 31 08:08:58 2006
@@ -325,9 +325,10 @@
public static void main( String[] args ) throws Exception {
- Ontology ontology = OntologyFactory.getOntology();
+ NutchConf nutchConf = new NutchConf();
+ Ontology ontology = new OntologyFactory(nutchConf).getOntology();
- String urls = NutchConf.get().get("extension.ontology.urls");
+ String urls = nutchConf.get("extension.ontology.urls");
if (urls==null || urls.trim().equals("")) {
LOG.severe("No ontology url found.");
return;
Modified: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java Tue Jan 31 08:08:58 2006
@@ -25,6 +25,7 @@
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.util.NutchConf;
import junit.framework.TestCase;
@@ -50,12 +51,14 @@
private String[] sampleFiles = {"time.owl"};
private static Ontology ontology;
-
+ private NutchConf nutchConf;
public TestOntology(String name) {
super(name);
}
- protected void setUp() {}
+ protected void setUp() {
+ this.nutchConf = new NutchConf();
+ }
protected void tearDown() {}
@@ -66,7 +69,7 @@
if (ontology==null) {
try {
- ontology = OntologyFactory.getOntology();
+ ontology = new OntologyFactory(this.nutchConf).getOntology();
} catch (Exception e) {
throw new Exception("Failed to instantiate ontology");
}