You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2006/02/04 01:39:32 UTC
svn commit: r374796 [3/5] - in /lucene/nutch/trunk: bin/ conf/ lib/
lib/jetty-ext/ src/java/org/apache/nutch/analysis/
src/java/org/apache/nutch/clustering/ src/java/org/apache/nutch/crawl/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/f...
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Fri Feb 3 16:38:32 2006
@@ -25,9 +25,12 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.ipc.RPC;
+
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.ipc.RPC;
+
+import org.apache.nutch.util.NutchConfiguration;
/** Implements the search API over IPC connections. */
public class DistributedSearch {
@@ -61,10 +64,10 @@
int port = Integer.parseInt(args[0]);
File directory = new File(args[1]);
- NutchConf nutchConf = new NutchConf();
- NutchBean bean = new NutchBean(nutchConf, directory);
+ Configuration conf = NutchConfiguration.create();
+ NutchBean bean = new NutchBean(conf, directory);
- org.apache.nutch.ipc.Server server = RPC.getServer(bean, port, 10, true, nutchConf);
+ org.apache.hadoop.ipc.Server server = RPC.getServer(bean, port, 10, true, conf);
server.start();
server.join();
}
@@ -81,15 +84,15 @@
private HashMap segmentToAddress = new HashMap();
private boolean running = true;
- private NutchConf nutchConf;
+ private Configuration conf;
/** Construct a client talking to servers listed in the named file.
* Each line in the file lists a server hostname and port, separated by
* whitespace.
*/
- public Client(File file, NutchConf nutchConf) throws IOException {
- this(readConfig(file), nutchConf);
+ public Client(File file, Configuration conf) throws IOException {
+ this(readConfig(file), conf);
}
private static InetSocketAddress[] readConfig(File config)
@@ -113,12 +116,12 @@
}
/** Construct a client talking to the named servers. */
- public Client(InetSocketAddress[] addresses, NutchConf nutchConf) throws IOException {
+ public Client(InetSocketAddress[] addresses, Configuration conf) throws IOException {
this.defaultAddresses = addresses;
updateSegments();
setDaemon(true);
start();
- this.nutchConf = nutchConf;
+ this.conf = conf;
}
private static final Method GET_SEGMENTS;
@@ -155,7 +158,7 @@
// build segmentToAddress map
Object[][] params = new Object[defaultAddresses.length][0];
String[][] results =
- (String[][])RPC.call(GET_SEGMENTS, params, defaultAddresses, this.nutchConf);
+ (String[][])RPC.call(GET_SEGMENTS, params, defaultAddresses, this.conf);
for (int i = 0; i < results.length; i++) { // process results of call
InetSocketAddress addr = defaultAddresses[i];
@@ -199,7 +202,7 @@
params[i][3] = sortField;
params[i][4] = Boolean.valueOf(reverse);
}
- Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.nutchConf);
+ Hits[] results = (Hits[])RPC.call(SEARCH, params, liveAddresses, this.conf);
TreeSet queue; // cull top hits from results
@@ -238,13 +241,13 @@
private Protocol getRemote(Hit hit) {
return (Protocol)
- RPC.getProxy(Protocol.class, liveAddresses[hit.getIndexNo()], nutchConf);
+ RPC.getProxy(Protocol.class, liveAddresses[hit.getIndexNo()], conf);
}
private Protocol getRemote(HitDetails hit) {
InetSocketAddress address =
(InetSocketAddress)segmentToAddress.get(hit.getValue("segment"));
- return (Protocol)RPC.getProxy(Protocol.class, address, nutchConf);
+ return (Protocol)RPC.getProxy(Protocol.class, address, conf);
}
public String getExplanation(Query query, Hit hit) throws IOException {
@@ -262,7 +265,7 @@
addrs[i] = liveAddresses[hits[i].getIndexNo()];
params[i][0] = hits[i];
}
- return (HitDetails[])RPC.call(DETAILS, params, addrs, nutchConf);
+ return (HitDetails[])RPC.call(DETAILS, params, addrs, conf);
}
@@ -281,7 +284,7 @@
params[i][0] = hit;
params[i][1] = query;
}
- return (String[])RPC.call(SUMMARY, params, addrs, nutchConf);
+ return (String[])RPC.call(SUMMARY, params, addrs, conf);
}
public byte[] getContent(HitDetails hit) throws IOException {
@@ -316,7 +319,7 @@
System.exit(-1);
}
- Query query = Query.parse(args[0], new NutchConf());
+ Query query = Query.parse(args[0], NutchConfiguration.create());
InetSocketAddress[] addresses = new InetSocketAddress[(args.length-1)/2];
for (int i = 0; i < (args.length-1)/2; i++) {
@@ -324,7 +327,7 @@
new InetSocketAddress(args[i*2+1], Integer.parseInt(args[i*2+2]));
}
- Client client = new Client(addresses, new NutchConf());
+ Client client = new Client(addresses, NutchConfiguration.create());
//client.setTimeout(Integer.MAX_VALUE);
Hits hits = client.search(query, 10, null, null, false);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Fri Feb 3 16:38:32 2006
@@ -21,13 +21,13 @@
import java.util.HashMap;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.mapred.*;
-import org.apache.nutch.mapred.lib.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.*;
import org.apache.nutch.crawl.*;
/** Implements {@link HitSummarizer} and {@link HitContent} for a set of
@@ -37,19 +37,19 @@
private static class Segment {
private static final Partitioner PARTITIONER = new HashPartitioner();
- private NutchFileSystem nfs;
+ private FileSystem fs;
private File segmentDir;
private MapFile.Reader[] content;
private MapFile.Reader[] parseText;
private MapFile.Reader[] parseData;
private MapFile.Reader[] crawl;
- private NutchConf nutchConf;
+ private Configuration conf;
- public Segment(NutchFileSystem nfs, File segmentDir, NutchConf nutchConf) throws IOException {
- this.nfs = nfs;
+ public Segment(FileSystem fs, File segmentDir, Configuration conf) throws IOException {
+ this.fs = fs;
this.segmentDir = segmentDir;
- this.nutchConf = nutchConf;
+ this.conf = conf;
}
public CrawlDatum getCrawlDatum(UTF8 url) throws IOException {
@@ -85,7 +85,7 @@
}
private MapFile.Reader[] getReaders(String subDir) throws IOException {
- return MapFileOutputFormat.getReaders(nfs, new File(segmentDir, subDir), this.nutchConf);
+ return MapFileOutputFormat.getReaders(fs, new File(segmentDir, subDir), this.conf);
}
private Writable getEntry(MapFile.Reader[] readers, UTF8 url,
@@ -101,20 +101,20 @@
private Summarizer summarizer;
/** Construct given a directory containing fetcher output. */
- public FetchedSegments(NutchFileSystem nfs, String segmentsDir, NutchConf nutchConf) throws IOException {
- File[] segmentDirs = nfs.listFiles(new File(segmentsDir));
- this.sumContext = nutchConf.getInt("searcher.summary.context", 5);
- this.sumLength = nutchConf.getInt("searcher.summary.length", 20);
- this.summarizer = new Summarizer(nutchConf);
+ public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException {
+ File[] segmentDirs = fs.listFiles(new File(segmentsDir));
+ this.sumContext = conf.getInt("searcher.summary.context", 5);
+ this.sumLength = conf.getInt("searcher.summary.length", 20);
+ this.summarizer = new Summarizer(conf);
if (segmentDirs != null) {
for (int i = 0; i < segmentDirs.length; i++) {
File segmentDir = segmentDirs[i];
// File indexdone = new File(segmentDir, IndexSegment.DONE_NAME);
-// if (nfs.exists(indexdone) && nfs.isFile(indexdone)) {
-// segments.put(segmentDir.getName(), new Segment(nfs, segmentDir));
+// if (fs.exists(indexdone) && fs.isFile(indexdone)) {
+// segments.put(segmentDir.getName(), new Segment(fs, segmentDir));
// }
- segments.put(segmentDir.getName(), new Segment(nfs, segmentDir, nutchConf));
+ segments.put(segmentDir.getName(), new Segment(fs, segmentDir, conf));
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java Fri Feb 3 16:38:32 2006
@@ -25,14 +25,14 @@
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.Query.Phrase;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
/** Translate query fields to search the same-named field, as indexed by an
* IndexingFilter. Best for tokenized fields. */
public abstract class FieldQueryFilter implements QueryFilter {
private String field;
private float boost = 1.0f;
- private NutchConf nutchConf;
+ private Configuration conf;
private CommonGrams commonGrams;
/** Construct for the named field.*/
@@ -93,12 +93,12 @@
return output;
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
this.commonGrams = new CommonGrams(conf);
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java Fri Feb 3 16:38:32 2006
@@ -20,11 +20,11 @@
import java.io.DataOutput;
import java.io.IOException;
-import org.apache.nutch.io.Writable;
-import org.apache.nutch.io.WritableComparable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/** A document which matched a query in an index. */
public class Hit implements Writable, Comparable {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitDetails.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitDetails.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitDetails.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitDetails.java Fri Feb 3 16:38:32 2006
@@ -21,9 +21,9 @@
import java.io.IOException;
import java.util.logging.Logger;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
import org.apache.nutch.html.Entities;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/** Data stored in the index for a hit.
*
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java Fri Feb 3 16:38:32 2006
@@ -20,12 +20,12 @@
import java.io.DataOutput;
import java.io.IOException;
-import org.apache.nutch.io.Writable;
-import org.apache.nutch.io.WritableComparable;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.UTF8;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/** A set of hits matching a query. */
public final class Hits implements Writable {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Fri Feb 3 16:38:32 2006
@@ -36,9 +36,9 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
import org.apache.nutch.indexer.*;
/** Implements {@link Searcher} and {@link HitDetailer} for either a single
@@ -48,42 +48,42 @@
private org.apache.lucene.search.Searcher luceneSearcher;
private org.apache.lucene.index.IndexReader reader;
private LuceneQueryOptimizer optimizer;
- private NutchFileSystem fs;
- private NutchConf nutchConf;
+ private FileSystem fs;
+ private Configuration conf;
private QueryFilters queryFilters;
/** Construct given a number of indexes. */
- public IndexSearcher(File[] indexDirs, NutchConf nutchConf) throws IOException {
+ public IndexSearcher(File[] indexDirs, Configuration conf) throws IOException {
IndexReader[] readers = new IndexReader[indexDirs.length];
- this.nutchConf = nutchConf;
- this.fs = NutchFileSystem.get(nutchConf);
+ this.conf = conf;
+ this.fs = FileSystem.get(conf);
for (int i = 0; i < indexDirs.length; i++) {
readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
}
- init(new MultiReader(readers), nutchConf);
+ init(new MultiReader(readers), conf);
}
/** Construct given a single merged index. */
- public IndexSearcher(File index, NutchConf nutchConf)
+ public IndexSearcher(File index, Configuration conf)
throws IOException {
- this.nutchConf = nutchConf;
- this.fs = NutchFileSystem.get(nutchConf);
- init(IndexReader.open(getDirectory(index)), nutchConf);
+ this.conf = conf;
+ this.fs = FileSystem.get(conf);
+ init(IndexReader.open(getDirectory(index)), conf);
}
- private void init(IndexReader reader, NutchConf nutchConf) throws IOException {
+ private void init(IndexReader reader, Configuration conf) throws IOException {
this.reader = reader;
this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
this.luceneSearcher.setSimilarity(new NutchSimilarity());
- this.optimizer = new LuceneQueryOptimizer(nutchConf);
- this.queryFilters = new QueryFilters(nutchConf);
+ this.optimizer = new LuceneQueryOptimizer(conf);
+ this.queryFilters = new QueryFilters(conf);
}
private Directory getDirectory(File file) throws IOException {
if ("local".equals(this.fs.getName())) {
return FSDirectory.getDirectory(file, false);
} else {
- return new NdfsDirectory(this.fs, file, false, this.nutchConf);
+ return new FsDirectory(this.fs, file, false, this.conf);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java Fri Feb 3 16:38:32 2006
@@ -9,9 +9,9 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.LinkDbReader;
-import org.apache.nutch.fs.NutchFileSystem;
-import org.apache.nutch.io.UTF8;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.conf.Configuration;
import java.io.File;
@@ -19,8 +19,8 @@
private LinkDbReader linkdb = null;
- public LinkDbInlinks(NutchFileSystem fs, File dir, NutchConf nutchConf) {
- linkdb = new LinkDbReader(fs, dir, nutchConf);
+ public LinkDbInlinks(FileSystem fs, File dir, Configuration conf) {
+ linkdb = new LinkDbReader(fs, dir, conf);
}
public String[] getAnchors(HitDetails details) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Fri Feb 3 16:38:32 2006
@@ -22,7 +22,7 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.misc.ChainedFilter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import java.util.LinkedHashMap;
import java.util.Map;
@@ -72,11 +72,11 @@
* @param threshold
* the fraction of documents which must contain a term
*/
- public LuceneQueryOptimizer(NutchConf nutchConf) {
- final int cacheSize = nutchConf.getInt("searcher.filter.cache.size", 16);
- this.threshold = nutchConf.getFloat("searcher.filter.cache.threshold",
+ public LuceneQueryOptimizer(Configuration conf) {
+ final int cacheSize = conf.getInt("searcher.filter.cache.size", 16);
+ this.threshold = conf.getFloat("searcher.filter.cache.threshold",
0.05f);
- this.searcherMaxHits = nutchConf.getInt("searcher.max.hits", -1);
+ this.searcherMaxHits = conf.getInt("searcher.max.hits", -1);
this.searcherMaxHits = searcherMaxHits;
this.cache = new LinkedHashMap(cacheSize, 0.75f, true) {
protected boolean removeEldestEntry(Map.Entry eldest) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Fri Feb 3 16:38:32 2006
@@ -21,11 +21,13 @@
import java.util.logging.Logger;
import javax.servlet.ServletContext;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
import org.apache.nutch.parse.*;
import org.apache.nutch.indexer.*;
import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.util.NutchConfiguration;
/**
* One stop shopping for search-related functionality.
@@ -55,12 +57,12 @@
* don't want to use too many of those. */
private static final int MAX_PROHIBITED_TERMS = 20;
- private NutchConf nutchConf;
+ private Configuration conf;
- private NutchFileSystem fs;
+ private FileSystem fs;
/** Cache in servlet context. */
- public static NutchBean get(ServletContext app, NutchConf conf) throws IOException {
+ public static NutchBean get(ServletContext app, Configuration conf) throws IOException {
NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
if (bean == null) {
LOG.info("creating new bean");
@@ -73,29 +75,29 @@
/**
*
- * @param nutchConf
+ * @param conf
* @throws IOException
*/
- public NutchBean(NutchConf nutchConf) throws IOException {
- this(nutchConf, null);
+ public NutchBean(Configuration conf) throws IOException {
+ this(conf, null);
}
/**
* Construct in a named directory.
- * @param nutchConf
+ * @param conf
* @param dir
* @throws IOException
*/
- public NutchBean(NutchConf nutchConf, File dir) throws IOException {
- this.nutchConf = nutchConf;
- this.fs = NutchFileSystem.get(this.nutchConf);
+ public NutchBean(Configuration conf, File dir) throws IOException {
+ this.conf = conf;
+ this.fs = FileSystem.get(this.conf);
if (dir == null) {
- dir = new File(this.nutchConf.get("searcher.dir", "crawl"));
+ dir = new File(this.conf.get("searcher.dir", "crawl"));
}
File servers = new File(dir, "search-servers.txt");
if (fs.exists(servers)) {
LOG.info("searching servers in " + servers.getCanonicalPath());
- init(new DistributedSearch.Client(servers, nutchConf));
+ init(new DistributedSearch.Client(servers, conf));
} else {
init(new File(dir, "index"), new File(dir, "indexes"), new File(
dir, "segments"), new File(dir, "linkdb"));
@@ -108,7 +110,7 @@
IndexSearcher indexSearcher;
if (this.fs.exists(indexDir)) {
LOG.info("opening merged index in " + indexDir);
- indexSearcher = new IndexSearcher(indexDir, this.nutchConf);
+ indexSearcher = new IndexSearcher(indexDir, this.conf);
} else {
LOG.info("opening indexes in " + indexesDir);
@@ -127,11 +129,11 @@
directories[i]=(File)vDirs.remove(0);
}
- indexSearcher = new IndexSearcher(directories, this.nutchConf);
+ indexSearcher = new IndexSearcher(directories, this.conf);
}
LOG.info("opening segments in " + segmentsDir);
- FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.nutchConf);
+ FetchedSegments segments = new FetchedSegments(this.fs, segmentsDir.toString(),this.conf);
this.segmentNames = segments.getSegmentNames();
@@ -141,7 +143,7 @@
this.content = segments;
LOG.info("opening linkdb in " + linkDb);
- this.linkDb = new LinkDbInlinks(fs, linkDb, this.nutchConf);
+ this.linkDb = new LinkDbInlinks(fs, linkDb, this.conf);
}
private void init(DistributedSearch.Client client) {
@@ -230,7 +232,7 @@
if (maxHitsPerDup <= 0) // disable dup checking
return search(query, numHits, dedupField, sortField, reverse);
- float rawHitsFactor = this.nutchConf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
+ float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
int numHitsRaw = (int)(numHits * rawHitsFactor);
LOG.info("searching for "+numHitsRaw+" raw hits");
Hits hits = searcher.search(query, numHitsRaw,
@@ -359,9 +361,9 @@
System.exit(-1);
}
- NutchConf nutchConf = new NutchConf();
- NutchBean bean = new NutchBean(nutchConf);
- Query query = Query.parse(args[0], nutchConf);
+ Configuration conf = NutchConfiguration.create();
+ NutchBean bean = new NutchBean(conf);
+ Query query = Query.parse(args[0], conf);
Hits hits = bean.search(query, 10);
System.out.println("Total hits: " + hits.getTotal());
int length = (int)Math.min(hits.getTotal(), 10);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Fri Feb 3 16:38:32 2006
@@ -32,7 +32,7 @@
import javax.xml.parsers.*;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.w3c.dom.*;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.Transformer;
@@ -57,12 +57,12 @@
}
private NutchBean bean;
- private NutchConf nutchConf;
+ private Configuration conf;
- public void init(ServletConfig config, NutchConf nutchConf) throws ServletException {
+ public void init(ServletConfig config, Configuration conf) throws ServletException {
try {
- bean = NutchBean.get(config.getServletContext(), nutchConf);
- this.nutchConf = nutchConf;
+ bean = NutchBean.get(config.getServletContext(), conf);
+ this.conf = conf;
} catch (IOException e) {
throw new ServletException(e);
}
@@ -118,7 +118,7 @@
(sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") +
(dedupField == null ? "" : "&dedupField=" + dedupField));
- Query query = Query.parse(queryString, this.nutchConf);
+ Query query = Query.parse(queryString, this.conf);
NutchBean.LOG.info("query: " + queryString);
// execute the query
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java Fri Feb 3 16:38:32 2006
@@ -25,11 +25,11 @@
import java.util.ArrayList;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.analysis.NutchAnalysis;
-
-import org.apache.nutch.io.Writable;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Writable;
/** A Nutch query. */
public final class Query implements Writable, Cloneable {
@@ -50,32 +50,32 @@
private float weight = 1.0f;
private Object termOrPhrase;
- private NutchConf nutchConf;
+ private Configuration conf;
public Clause(Term term, String field,
- boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
- this(term, isRequired, isProhibited, nutchConf);
+ boolean isRequired, boolean isProhibited, Configuration conf) {
+ this(term, isRequired, isProhibited, conf);
this.field = field;
}
- public Clause(Term term, boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
+ public Clause(Term term, boolean isRequired, boolean isProhibited, Configuration conf) {
this.isRequired = isRequired;
this.isProhibited = isProhibited;
this.termOrPhrase = term;
- this.nutchConf = nutchConf;
+ this.conf = conf;
}
public Clause(Phrase phrase, String field,
- boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
- this(phrase, isRequired, isProhibited, nutchConf);
+ boolean isRequired, boolean isProhibited, Configuration conf) {
+ this(phrase, isRequired, isProhibited, conf);
this.field = field;
}
- public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, NutchConf nutchConf) {
+ public Clause(Phrase phrase, boolean isRequired, boolean isProhibited, Configuration conf) {
this.isRequired = isRequired;
this.isProhibited = isProhibited;
this.termOrPhrase = phrase;
- this.nutchConf = nutchConf;
+ this.conf = conf;
}
public boolean isRequired() { return isRequired; }
@@ -109,7 +109,7 @@
getTerm().write(out);
}
- public static Clause read(DataInput in, NutchConf nutchConf) throws IOException {
+ public static Clause read(DataInput in, Configuration conf) throws IOException {
byte bits = in.readByte();
boolean required = ((bits & REQUIRED_BIT) != 0);
boolean prohibited = ((bits & PROHIBITED_BIT) != 0);
@@ -119,9 +119,9 @@
Clause clause;
if ((bits & PHRASE_BIT) == 0) {
- clause = new Clause(Term.read(in), field, required, prohibited, nutchConf);
+ clause = new Clause(Term.read(in), field, required, prohibited, conf);
} else {
- clause = new Clause(Phrase.read(in), field, required, prohibited, nutchConf);
+ clause = new Clause(Phrase.read(in), field, required, prohibited, conf);
}
clause.weight = weight;
return clause;
@@ -140,7 +140,7 @@
buffer.append(":");
}
- if (!isPhrase() && new QueryFilters(nutchConf).isRawField(field)) {
+ if (!isPhrase() && new QueryFilters(conf).isRawField(field)) {
buffer.append('"'); // quote raw terms
buffer.append(termOrPhrase.toString());
buffer.append('"');
@@ -279,12 +279,12 @@
private ArrayList clauses = new ArrayList();
- private NutchConf nutchConf;
+ private Configuration conf;
private static final Clause[] CLAUSES_PROTO = new Clause[0];
- public Query(NutchConf nutchConf) {
- this.nutchConf = nutchConf;
+ public Query(Configuration conf) {
+ this.conf = conf;
}
/** Return all clauses. */
@@ -299,7 +299,7 @@
/** Add a required term in a specified field. */
public void addRequiredTerm(String term, String field) {
- clauses.add(new Clause(new Term(term), field, true, false, this.nutchConf));
+ clauses.add(new Clause(new Term(term), field, true, false, this.conf));
}
/** Add a prohibited term in the default field. */
@@ -309,7 +309,7 @@
/** Add a prohibited term in the specified field. */
public void addProhibitedTerm(String term, String field) {
- clauses.add(new Clause(new Term(term), field, false, true, this.nutchConf));
+ clauses.add(new Clause(new Term(term), field, false, true, this.conf));
}
/** Add a required phrase in the default field. */
@@ -323,7 +323,7 @@
} else if (terms.length == 1) {
addRequiredTerm(terms[0], field); // optimize to term query
} else {
- clauses.add(new Clause(new Phrase(terms), field, true, false, this.nutchConf));
+ clauses.add(new Clause(new Phrase(terms), field, true, false, this.conf));
}
}
@@ -338,7 +338,7 @@
} else if (terms.length == 1) {
addProhibitedTerm(terms[0], field); // optimize to term query
} else {
- clauses.add(new Clause(new Phrase(terms), field, false, true, this.nutchConf));
+ clauses.add(new Clause(new Phrase(terms), field, false, true, this.conf));
}
}
@@ -348,8 +348,8 @@
((Clause)clauses.get(i)).write(out);
}
- public static Query read(DataInput in, NutchConf nutchConf) throws IOException {
- Query result = new Query(nutchConf);
+ public static Query read(DataInput in, Configuration conf) throws IOException {
+ Query result = new Query(conf);
result.readFields(in);
return result;
}
@@ -358,7 +358,7 @@
clauses.clear();
int length = in.readByte();
for (int i = 0; i < length; i++)
- clauses.add(Clause.read(in, this.nutchConf));
+ clauses.add(Clause.read(in, this.conf));
}
public String toString() {
@@ -415,18 +415,18 @@
/** Parse a query from a string. */
- public static Query parse(String queryString, NutchConf nutchConf) throws IOException {
- return fixup(NutchAnalysis.parseQuery(queryString, nutchConf), nutchConf);
+ public static Query parse(String queryString, Configuration conf) throws IOException {
+ return fixup(NutchAnalysis.parseQuery(queryString, conf), conf);
}
/** Convert clauses in unknown fields to the default field. */
- private static Query fixup(Query input, NutchConf nutchConf) {
+ private static Query fixup(Query input, Configuration conf) {
// walk the query
- Query output = new Query(nutchConf);
+ Query output = new Query(conf);
Clause[] clauses = input.getClauses();
for (int i = 0; i < clauses.length; i++) {
Clause c = clauses[i];
- if (!new QueryFilters(nutchConf).isField(c.getField())) { // unknown field
+ if (!new QueryFilters(conf).isField(c.getField())) { // unknown field
ArrayList terms = new ArrayList(); // add name to query
if (c.isPhrase()) {
terms.addAll(Arrays.asList(c.getPhrase().getTerms()));
@@ -447,13 +447,13 @@
/** For debugging. */
public static void main(String[] args) throws Exception {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
- NutchConf nutchConf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
while (true) {
System.out.print("Query: ");
String line = in.readLine();
- Query query = parse(line, nutchConf);
+ Query query = parse(line, conf);
System.out.println("Parsed: " + query);
- System.out.println("Translated: " + new QueryFilters(nutchConf).filter(query));
+ System.out.println("Translated: " + new QueryFilters(conf).filter(query));
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java Fri Feb 3 16:38:32 2006
@@ -17,13 +17,13 @@
package org.apache.nutch.searcher;
import org.apache.lucene.search.BooleanQuery;
-import org.apache.nutch.util.NutchConfigurable;
+import org.apache.hadoop.conf.Configurable;
/** Extension point for query translation. Permits one to add metadata to a
* query. All plugins found which implement this extension point are run
* sequentially on the query.
*/
-public interface QueryFilter extends NutchConfigurable {
+public interface QueryFilter extends Configurable {
/** The name of the extension point. */
final static String X_POINT_ID = QueryFilter.class.getName();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java Fri Feb 3 16:38:32 2006
@@ -18,8 +18,8 @@
import org.apache.nutch.plugin.*;
import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
import java.util.logging.Logger;
import java.util.*;
@@ -48,12 +48,12 @@
return Collections.list(new StringTokenizer(fields, " ,\t\n\r"));
}
- public QueryFilters(NutchConf nutchConf) {
- this.queryFilters = (QueryFilter[]) nutchConf.getObject(QueryFilter.class
+ public QueryFilters(Configuration conf) {
+ this.queryFilters = (QueryFilter[]) conf.getObject(QueryFilter.class
.getName());
if (this.queryFilters == null) {
try {
- ExtensionPoint point = nutchConf.getPluginRepository()
+ ExtensionPoint point = PluginRepository.get(conf)
.getExtensionPoint(QueryFilter.X_POINT_ID);
if (point == null)
throw new RuntimeException(QueryFilter.X_POINT_ID + " not found.");
@@ -73,20 +73,20 @@
filters[i] = (QueryFilter) extension.getExtensionInstance();
FIELD_NAMES.addAll(fieldNames);
FIELD_NAMES.addAll(rawFieldNames);
- nutchConf.setObject("FIELD_NAMES", FIELD_NAMES);
+ conf.setObject("FIELD_NAMES", FIELD_NAMES);
RAW_FIELD_NAMES.addAll(rawFieldNames);
- nutchConf.setObject("RAW_FIELD_NAMES", RAW_FIELD_NAMES);
+ conf.setObject("RAW_FIELD_NAMES", RAW_FIELD_NAMES);
}
- nutchConf.setObject(QueryFilter.class.getName(), filters);
+ conf.setObject(QueryFilter.class.getName(), filters);
} catch (PluginRuntimeException e) {
throw new RuntimeException(e);
}
- this.queryFilters = (QueryFilter[]) nutchConf.getObject(QueryFilter.class
+ this.queryFilters = (QueryFilter[]) conf.getObject(QueryFilter.class
.getName());
} else {
// cache already filled
- FIELD_NAMES = (HashSet) nutchConf.getObject("FIELD_NAMES");
- RAW_FIELD_NAMES = (HashSet) nutchConf.getObject("RAW_FIELD_NAMES");
+ FIELD_NAMES = (HashSet) conf.getObject("FIELD_NAMES");
+ RAW_FIELD_NAMES = (HashSet) conf.getObject("RAW_FIELD_NAMES");
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java Fri Feb 3 16:38:32 2006
@@ -23,23 +23,24 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.searcher.Summary.*;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.util.NutchConfiguration;
/** Implements hit summarization. */
public class Summarizer {
/** Converts text to tokens. */
private Analyzer ANALYZER;
- private NutchConf nutchConf;
+ private Configuration conf;
/**
* The constructor.
* @param conf
*/
- public Summarizer(NutchConf conf) {
- this.nutchConf = conf;
+ public Summarizer(Configuration conf) {
+ this.conf = conf;
this.ANALYZER = new NutchDocumentAnalyzer(conf);
}
@@ -292,7 +293,7 @@
return;
}
- Summarizer s = new Summarizer(new NutchConf());
+ Summarizer s = new Summarizer(NutchConfiguration.create());
//
// Parse the args
@@ -320,11 +321,11 @@
in.close();
}
- NutchConf nutchConf = new NutchConf();
- int sumContext = nutchConf.getInt("searcher.summary.context", 5);
- int sumLength = nutchConf.getInt("searcher.summary.length", 20);
+ Configuration conf = NutchConfiguration.create();
+ int sumContext = conf.getInt("searcher.summary.context", 5);
+ int sumLength = conf.getInt("searcher.summary.length", 20);
// Convert the query string into a proper Query
- Query query = Query.parse(queryBuf.toString(), nutchConf);
+ Query query = Query.parse(queryBuf.toString(), conf);
System.out.println("Summary: '" + s.getSummary(body.toString(), query, sumContext, sumLength) + "'");
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Fri Feb 3 16:38:32 2006
@@ -28,30 +28,31 @@
import java.util.logging.Logger;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.fs.NutchFileSystem;
-import org.apache.nutch.io.ObjectWritable;
-import org.apache.nutch.io.UTF8;
-import org.apache.nutch.io.Writable;
-import org.apache.nutch.io.WritableComparable;
-import org.apache.nutch.mapred.FileSplit;
-import org.apache.nutch.mapred.JobClient;
-import org.apache.nutch.mapred.JobConf;
-import org.apache.nutch.mapred.OutputCollector;
-import org.apache.nutch.mapred.RecordReader;
-import org.apache.nutch.mapred.RecordWriter;
-import org.apache.nutch.mapred.Reducer;
-import org.apache.nutch.mapred.Reporter;
-import org.apache.nutch.mapred.SequenceFileInputFormat;
-import org.apache.nutch.mapred.SequenceFileRecordReader;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.NutchConfigured;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.util.NutchConfiguration;
/** Dump the content of a segment. */
-public class SegmentReader extends NutchConfigured implements Reducer {
+public class SegmentReader extends Configured implements Reducer {
public static final String DIR_NAME = "segdump";
@@ -63,7 +64,7 @@
/** Wraps inputs in an {@link ObjectWritable}, to permit merging different
* types in reduce. */
public static class InputFormat extends SequenceFileInputFormat {
- public RecordReader getRecordReader(NutchFileSystem fs, FileSplit split,
+ public RecordReader getRecordReader(FileSystem fs, FileSplit split,
JobConf job, Reporter reporter)
throws IOException {
reporter.setStatus(split.toString());
@@ -85,8 +86,8 @@
/** Implements a text output format*/
public static class TextOutputFormat
- implements org.apache.nutch.mapred.OutputFormat {
- public RecordWriter getRecordWriter(final NutchFileSystem fs, JobConf job,
+ implements org.apache.hadoop.mapred.OutputFormat {
+ public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
String name) throws IOException {
final File segmentDumpFile =
@@ -113,7 +114,7 @@
super(null);
}
- public SegmentReader(NutchConf conf) {
+ public SegmentReader(Configuration conf) {
super(conf);
}
@@ -170,23 +171,23 @@
JobClient.runJob(job);
// concatenate the output
- NutchFileSystem nfs = NutchFileSystem.get(job);
+ FileSystem fs = FileSystem.get(job);
File directory = new File(job.getOutputDir(), SegmentReader.DIR_NAME);
File dumpFile = new File(directory, job.get("segment.dump.dir", "dump"));
// remove the old file
- nfs.delete(dumpFile);
- File[] files = nfs.listFiles(directory);
+ fs.delete(dumpFile);
+ File[] files = fs.listFiles(directory);
PrintWriter writer = null;
int currentReccordNumber = 0;
if (files.length > 0) {
- writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(nfs.create(dumpFile))));
+ writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(fs.create(dumpFile))));
try {
for (int i = 0 ; i < files.length; i++) {
File partFile = (File)files[i];
try {
- currentReccordNumber = append(nfs, job, partFile, writer, currentReccordNumber);
+ currentReccordNumber = append(fs, job, partFile, writer, currentReccordNumber);
} catch (IOException exception) {
LOG.warning("Couldn't copy the content of " + partFile.toString() + " into " + dumpFile.toString());
LOG.warning(exception.getMessage());
@@ -201,8 +202,8 @@
}
/** Appends two files and updates the Recno counter*/
- private int append(NutchFileSystem nfs, NutchConf conf, File src, PrintWriter writer, int currentReccordNumber) throws IOException {
- BufferedReader reader = new BufferedReader(new InputStreamReader(nfs.open(src)));
+ private int append(FileSystem fs, Configuration conf, File src, PrintWriter writer, int currentReccordNumber) throws IOException {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(src)));
try {
String line = reader.readLine();
while(line != null) {
@@ -219,8 +220,8 @@
}
public static void main(String[] args) throws Exception {
- NutchConf nutchConf = new NutchConf();
- SegmentReader segmentReader = new SegmentReader(nutchConf);
+ Configuration conf = NutchConfiguration.create();
+ SegmentReader segmentReader = new SegmentReader(conf);
String usage = "Usage: SegmentReader <segment>";
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java Fri Feb 3 16:38:32 2006
@@ -21,7 +21,7 @@
import org.apache.nutch.searcher.Hit;
import org.apache.nutch.searcher.HitDetails;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
@@ -44,9 +44,9 @@
NutchBean bean = null;
- public void init(NutchConf nutchConf) {
+ public void init(Configuration conf) {
try {
- bean = NutchBean.get(this.getServletContext(), nutchConf);
+ bean = NutchBean.get(this.getServletContext(), conf);
} catch (IOException e) {
// nothing
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Fri Feb 3 16:38:32 2006
@@ -26,10 +26,12 @@
import org.xml.sax.helpers.*;
import org.apache.xerces.util.XMLChar;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
public class DmozParser {
@@ -335,8 +337,8 @@
Pattern topicPattern = null;
Vector topics = new Vector();
- NutchConf nutchConf = new NutchConf();
- NutchFileSystem nfs = NutchFileSystem.get(nutchConf);
+ Configuration conf = NutchConfiguration.create();
+ FileSystem fs = FileSystem.get(conf);
try {
for (int i = 1; i < argv.length; i++) {
if ("-includeAdultMaterial".equals(argv[i])) {
@@ -375,7 +377,7 @@
includeAdult, skew, topicPattern);
} finally {
- nfs.close();
+ fs.close();
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java Fri Feb 3 16:38:32 2006
@@ -37,9 +37,10 @@
import java.util.Vector;
import java.util.logging.Logger;
-import org.apache.nutch.io.UTF8;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
@@ -459,9 +460,9 @@
if (qPath != null) {
is = new FileInputStream(qPath);
} else {
- NutchConf nutchConf = new NutchConf();
- qPath = nutchConf.get("prune.index.tool.queries");
- is = nutchConf.getConfResourceAsInputStream(qPath);
+ Configuration conf = NutchConfiguration.create();
+ qPath = conf.get("prune.index.tool.queries");
+ is = conf.getConfResourceAsInputStream(qPath);
}
if (is == null) {
LOG.severe("Can't load queries from " + qPath);
Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=374796&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Fri Feb 3 16:38:32 2006
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import org.apache.hadoop.conf.Configuration;
+
+/** Utility to create Hadoop {@link Configuration}s that include Nutch-specific
+ * resources. */
+public class NutchConfiguration {
+
+ /** Create a {@link Configuration} for Nutch. */
+ public static Configuration create() {
+ Configuration conf = new Configuration();
+ addNutchResources(conf);
+ return conf;
+ }
+
+ /** Add the standard Nutch resources to {@link Configuration}. */
+ public static Configuration addNutchResources(Configuration conf) {
+ conf.addDefaultResource("nutch-default.xml");
+ conf.addFinalResource("nutch-site.xml");
+ return conf;
+ }
+}
+
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/ThreadPool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/ThreadPool.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/ThreadPool.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/ThreadPool.java Fri Feb 3 16:38:32 2006
@@ -18,7 +18,7 @@
import java.util.*;
import java.util.logging.*;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/************************************************
* ThreadPool.java
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Feb 3 16:38:32 2006
@@ -44,6 +44,7 @@
<!-- Test all of the plugins. -->
<!-- ====================================================== -->
<target name="test">
+ <parallel threadCount="2">
<ant dir="creativecommons" target="test"/>
<ant dir="languageidentifier" target="test"/>
<ant dir="lib-http" target="test"/>
@@ -58,6 +59,7 @@
<!-- <ant dir="parse-rtf" target="test"/> -->
<ant dir="parse-swf" target="test"/>
<ant dir="parse-zip" target="test"/>
+ </parallel>
</target>
<!-- ====================================================== -->
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java Fri Feb 3 16:38:32 2006
@@ -16,8 +16,8 @@
package org.creativecommons.nutch;
-import org.apache.nutch.io.*;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.LogFormatter;
import org.apache.nutch.indexer.Indexer;
import org.apache.lucene.index.IndexReader;
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Fri Feb 3 16:38:32 2006
@@ -23,14 +23,14 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
import java.util.*;
import java.net.URL;
@@ -44,7 +44,7 @@
/** The name of the document field we use. */
public static String FIELD = "cc";
- private NutchConf nutchConf;
+ private Configuration conf;
public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -103,12 +103,12 @@
doc.add(Field.Keyword(FIELD, feature));
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Fri Feb 3 16:38:32 2006
@@ -19,7 +19,7 @@
import org.apache.nutch.parse.*;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import java.util.*;
import java.io.*;
@@ -29,7 +29,7 @@
import org.w3c.dom.*;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/** Adds metadata identifying the Creative Commons license used, if any. */
public class CCParseFilter implements HtmlParseFilter {
@@ -50,7 +50,7 @@
}
/** Scan the document adding attributes to metadata.*/
- public static void walk(Node doc, URL base, ContentProperties metadata, NutchConf nutchConf)
+ public static void walk(Node doc, URL base, ContentProperties metadata, Configuration conf)
throws ParseException {
// walk the DOM tree, scanning for license data
@@ -69,7 +69,7 @@
} else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
licenseLocation = "a";
licenseUrl = walker.anchorLicense.toString();
- } else if (nutchConf.getBoolean("creativecommons.exclude.unlicensed", false)) {
+ } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
throw new ParseException("No CC license. Excluding.");
}
@@ -249,7 +249,7 @@
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
}
- private NutchConf nutchConf;
+ private Configuration conf;
/** Adds metadata or otherwise modifies a parse of an HTML document, given
* the DOM tree of a page. */
@@ -273,11 +273,11 @@
return parse;
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java Fri Feb 3 16:38:32 2006
@@ -17,24 +17,24 @@
package org.creativecommons.nutch;
import org.apache.nutch.searcher.RawFieldQueryFilter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
/**
* Handles "cc:" query clauses, causing them to search the "cc" field indexed by
* CCIndexingFilter.
*/
public class CCQueryFilter extends RawFieldQueryFilter {
- private NutchConf nutchConf;
+ private Configuration conf;
public CCQueryFilter() {
super(CCIndexingFilter.FIELD);
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Fri Feb 3 16:38:32 2006
@@ -20,7 +20,8 @@
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
import java.util.Properties;
import java.io.*;
@@ -56,11 +57,11 @@
}
in.close();
byte[] bytes = out.toByteArray();
- NutchConf nutchConf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
Content content =
- new Content(url, url, bytes, contentType, new ContentProperties(), nutchConf);
- Parse parse = new ParseUtil(nutchConf).parseByParserId("parse-html",content);
+ new Content(url, url, bytes, contentType, new ContentProperties(), conf);
+ Parse parse = new ParseUtil(conf).parseByParserId("parse-html",content);
ContentProperties metadata = parse.getData().getMetadata();
assertEquals(license, metadata.get("License-Url"));
Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Fri Feb 3 16:38:32 2006
@@ -23,7 +23,7 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -32,8 +32,8 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
/** Adds basic searchable fields to a document. */
public class BasicIndexingFilter implements IndexingFilter {
@@ -41,7 +41,7 @@
= LogFormatter.getLogger(BasicIndexingFilter.class.getName());
private int MAX_TITLE_LENGTH;
- private NutchConf nutchConf;
+ private Configuration conf;
public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
@@ -89,13 +89,13 @@
return doc;
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Fri Feb 3 16:38:32 2006
@@ -33,17 +33,17 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypes;
import org.apache.nutch.util.mime.MimeTypeException;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
import java.util.logging.Logger;
import java.text.DateFormat;
@@ -245,7 +245,7 @@
// Content-Disposition: inline; filename="foo.ppt"
private PatternMatcher matcher = new Perl5Matcher();
- private NutchConf nutchConf;
+ private Configuration conf;
static Perl5Pattern patterns[] = {null, null};
static {
Perl5Compiler compiler = new Perl5Compiler();
@@ -301,14 +301,14 @@
return normalized;
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
MAGIC = conf.getBoolean("mime.type.magic", true);
MIME = MimeTypes.get(getConf().get("mime.types.file"));
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Fri Feb 3 16:38:32 2006
@@ -27,8 +27,8 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
// DOM imports
import org.w3c.dom.DocumentFragment;
@@ -69,7 +69,7 @@
}
}
- private NutchConf nutchConf;
+ private Configuration conf;
@@ -198,11 +198,11 @@
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Fri Feb 3 16:38:32 2006
@@ -35,7 +35,7 @@
// Nutch imports
import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.UTF8;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
@@ -46,8 +46,9 @@
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.util.NutchConfiguration;
/**
@@ -95,12 +96,12 @@
/**
* Constructs a new Language Identifier.
*/
- public LanguageIdentifier(NutchConf nutchConf) {
+ public LanguageIdentifier(Configuration conf) {
// Gets ngram sizes to take into account from the Nutch Config
- minLength = nutchConf.getInt("lang.ngram.min.length",
+ minLength = conf.getInt("lang.ngram.min.length",
NGramProfile.DEFAULT_MIN_NGRAM_LENGTH);
- maxLength = nutchConf.getInt("lang.ngram.max.length",
+ maxLength = conf.getInt("lang.ngram.max.length",
NGramProfile.DEFAULT_MAX_NGRAM_LENGTH);
// Ensure the min and max values are in an acceptale range
// (ie min >= DEFAULT_MIN_NGRAM_LENGTH and max <= DEFAULT_MAX_NGRAM_LENGTH)
@@ -110,7 +111,7 @@
minLength = Math.min(minLength, maxLength);
// Gets the value of the maximum size of data to analyze
- analyzeLength = nutchConf.getInt("lang.analyze.max.length",
+ analyzeLength = conf.getInt("lang.analyze.max.length",
DEFAULT_ANALYSIS_LENGTH);
Properties p = new Properties();
@@ -258,10 +259,10 @@
}
- NutchConf nutchConf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
String lang = null;
//LanguageIdentifier idfr = LanguageIdentifier.getInstance();
- LanguageIdentifier idfr = new LanguageIdentifier(nutchConf);
+ LanguageIdentifier idfr = new LanguageIdentifier(conf);
File f;
FileInputStream fis;
try {
@@ -279,7 +280,7 @@
break;
case IDURL:
- text = getUrlContent(filename, nutchConf);
+ text = getUrlContent(filename, conf);
lang = idfr.identify(text);
break;
@@ -335,13 +336,13 @@
* @param url
* @return contents of url
*/
- private static String getUrlContent(String url, NutchConf nutchConf) {
+ private static String getUrlContent(String url, Configuration conf) {
Protocol protocol;
try {
- protocol = new ProtocolFactory(nutchConf).getProtocol(url);
+ protocol = new ProtocolFactory(conf).getProtocol(url);
Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
String contentType = content.getContentType();
- Parser parser = new ParserFactory(nutchConf).getParser(contentType, url);
+ Parser parser = new ParserFactory(conf).getParser(contentType, url);
Parse parse = parser.getParse(content);
System.out.println("text:" + parse.getText());
return parse.getText();
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Fri Feb 3 16:38:32 2006
@@ -22,9 +22,9 @@
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.UTF8;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
// Lucene imports
import org.apache.lucene.document.Field;
@@ -50,7 +50,7 @@
public class LanguageIndexingFilter implements IndexingFilter {
- private NutchConf nutchConf;
+ private Configuration conf;
private LanguageIdentifier languageIdentifier;
/**
@@ -93,12 +93,12 @@
return doc;
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
this.languageIdentifier = new LanguageIdentifier(conf);
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java Fri Feb 3 16:38:32 2006
@@ -17,22 +17,22 @@
package org.apache.nutch.analysis.lang;
import org.apache.nutch.searcher.RawFieldQueryFilter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
/** Handles "lang:" query clauses, causing them to search the "lang" field
* indexed by LanguageIdentifier. */
public class LanguageQueryFilter extends RawFieldQueryFilter {
- private NutchConf nutchConf;
+ private Configuration conf;
public LanguageQueryFilter() {
super("lang");
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java Fri Feb 3 16:38:32 2006
@@ -35,7 +35,7 @@
import java.util.logging.Logger;
// Nutch imports
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
// Lucene imports
import org.apache.lucene.analysis.Token;
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Fri Feb 3 16:38:32 2006
@@ -26,7 +26,8 @@
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ContentProperties;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
public class TestHTMLLanguageParser extends TestCase {
@@ -53,7 +54,7 @@
for (int t = 0; t < docs.length; t++) {
Content content = getContent(docs[t]);
- Parser parser = new ParserFactory(new NutchConf()).getParser("text/html", URL);
+ Parser parser = new ParserFactory(NutchConfiguration.create()).getParser("text/html", URL);
Parse parse = parser.getParse(content);
assertEquals(metalanguages[t], (String) parse.getData().get(
@@ -126,7 +127,7 @@
ContentProperties p = new ContentProperties();
p.put("Content-Type", "text/html");
- Content content = new Content(URL, BASE, text.getBytes(), "text/html", p, new NutchConf());
+ Content content = new Content(URL, BASE, text.getBytes(), "text/html", p, NutchConfiguration.create());
return content;
}
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java Fri Feb 3 16:38:32 2006
@@ -32,8 +32,9 @@
// Lucene imports
import org.apache.lucene.analysis.Token;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
/**
* JUnit based test of class {@link LanguageIdentifier}.
@@ -205,7 +206,7 @@
public void testIdentify() {
try {
long total = 0;
- LanguageIdentifier idfr = new LanguageIdentifier(new NutchConf());
+ LanguageIdentifier idfr = new LanguageIdentifier(NutchConfiguration.create());
BufferedReader in = new BufferedReader(new InputStreamReader(
this.getClass().getResourceAsStream("test-referencial.txt")));
String line = null;
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Feb 3 16:38:32 2006
@@ -26,14 +26,14 @@
import java.util.logging.Logger;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.UTF8;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
/**
@@ -111,7 +111,7 @@
private Logger logger = LOGGER;
/** The nutch configuration */
- private NutchConf conf = null;
+ private Configuration conf = null;
/** Creates a new instance of HttpBase */
@@ -128,7 +128,7 @@
}
// Inherited Javadoc
- public void setConf(NutchConf conf) {
+ public void setConf(Configuration conf) {
this.conf = conf;
this.proxyHost = conf.get("http.proxy.host");
this.proxyPort = conf.getInt("http.proxy.port", 8080);
@@ -145,7 +145,7 @@
}
// Inherited Javadoc
- public NutchConf getConf() {
+ public Configuration getConf() {
return this.conf;
}
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Fri Feb 3 16:38:32 2006
@@ -29,9 +29,9 @@
import java.util.logging.Logger;
// Nutch imports
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.NutchConfigurable;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.util.LogFormatter;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
@@ -46,7 +46,7 @@
* @author Mike Cafarella
* @author Doug Cutting
*/
-public class RobotRulesParser implements NutchConfigurable {
+public class RobotRulesParser implements Configurable {
public static final Logger LOG=
LogFormatter.getLogger(RobotRulesParser.class.getName());
@@ -62,7 +62,7 @@
private static RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();
- private NutchConf conf;
+ private Configuration conf;
private HashMap robotNames;
/**
@@ -176,16 +176,16 @@
RobotRulesParser() { }
- public RobotRulesParser(NutchConf conf) {
+ public RobotRulesParser(Configuration conf) {
setConf(conf);
}
/* ---------------------------------- *
- * <implementation:NutchConfigurable> *
+ * <implementation:Configurable> *
* ---------------------------------- */
- public void setConf(NutchConf conf) {
+ public void setConf(Configuration conf) {
this.conf = conf;
allowForbidden = conf.getBoolean("http.robots.403.allow", false);
//
@@ -215,12 +215,12 @@
setRobotNames((String[]) agents.toArray(new String[agents.size()]));
}
- public NutchConf getConf() {
+ public Configuration getConf() {
return conf;
}
/* ---------------------------------- *
- * <implementation:NutchConfigurable> *
+ * <implementation:Configurable> *
* ---------------------------------- */
private void setRobotNames(String[] robotNames) {
Modified: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java Fri Feb 3 16:38:32 2006
@@ -16,8 +16,9 @@
package org.apache.nutch.ontology;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.util.NutchConfiguration;
import com.hp.hpl.jena.ontology.Individual;
import com.hp.hpl.jena.ontology.OntClass;
@@ -325,10 +326,10 @@
public static void main( String[] args ) throws Exception {
- NutchConf nutchConf = new NutchConf();
- Ontology ontology = new OntologyFactory(nutchConf).getOntology();
+ Configuration conf = NutchConfiguration.create();
+ Ontology ontology = new OntologyFactory(conf).getOntology();
- String urls = nutchConf.get("extension.ontology.urls");
+ String urls = conf.get("extension.ontology.urls");
if (urls==null || urls.trim().equals("")) {
LOG.severe("No ontology url found.");
return;
Modified: lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java Fri Feb 3 16:38:32 2006
@@ -21,7 +21,7 @@
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-//import org.apache.nutch.util.LogFormatter;
+//import org.apache.hadoop.util.LogFormatter;
import com.hp.hpl.jena.ontology.OntClass;
import com.hp.hpl.jena.ontology.OntModel;
Modified: lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java (original)
+++ lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java Fri Feb 3 16:38:32 2006
@@ -25,7 +25,8 @@
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
import junit.framework.TestCase;
@@ -51,13 +52,13 @@
private String[] sampleFiles = {"time.owl"};
private static Ontology ontology;
- private NutchConf nutchConf;
+ private Configuration conf;
public TestOntology(String name) {
super(name);
}
protected void setUp() {
- this.nutchConf = new NutchConf();
+ this.conf = NutchConfiguration.create();
}
protected void tearDown() {}
@@ -69,7 +70,7 @@
if (ontology==null) {
try {
- ontology = new OntologyFactory(this.nutchConf).getOntology();
+ ontology = new OntologyFactory(this.conf).getOntology();
} catch (Exception e) {
throw new Exception("Failed to instantiate ontology");
}