You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/01/31 17:13:17 UTC
svn commit: r373853 [1/6] - in /lucene/nutch/trunk/src:
java/org/apache/nutch/analysis/ java/org/apache/nutch/clustering/
java/org/apache/nutch/crawl/ java/org/apache/nutch/fetcher/
java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apac...
Author: ab
Date: Tue Jan 31 08:08:58 2006
New Revision: 373853
URL: http://svn.apache.org/viewcvs?rev=373853&view=rev
Log:
Apply patches from NUTCH-169 (remove static NutchConf).
Submitted by: Marko Bauhardt, Stefan Groschupf, Jerome Charron.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/FileUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/LocalFileSystem.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSFileSystem.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSShell.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataInputStream.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataOutputStream.java
lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/NdfsDirectory.java
lucene/nutch/trunk/src/java/org/apache/nutch/io/ArrayFile.java
lucene/nutch/trunk/src/java/org/apache/nutch/io/MapFile.java
lucene/nutch/trunk/src/java/org/apache/nutch/io/ObjectWritable.java
lucene/nutch/trunk/src/java/org/apache/nutch/io/SequenceFile.java
lucene/nutch/trunk/src/java/org/apache/nutch/io/SetFile.java
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Server.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/CombiningCollector.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/JobClient.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/JobConf.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/JobTracker.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/LocalJobRunner.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/MapOutputFile.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/MapTask.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/MapTaskRunner.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/ReduceTask.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/SequenceFileInputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/SequenceFileRecordReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/Task.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/demo/Grep.java
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/DataNode.java
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSConstants.java
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSNamesystem.java
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/NDFSClient.java
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/NameNode.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/BasicUrlNormalizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/RegexUrlNormalizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlNormalizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlNormalizerFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/ontology/OntologyFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/TypeQueryFilter.java
lucene/nutch/trunk/src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteQueryFilter.java
lucene/nutch/trunk/src/plugin/query-url/src/java/org/apache/nutch/searcher/url/URLQueryFilter.java
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/net/PrefixURLFilter.java
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
lucene/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java
lucene/nutch/trunk/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
lucene/nutch/trunk/src/test/org/apache/nutch/io/TestArrayFile.java
lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java
lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSetFile.java
lucene/nutch/trunk/src/test/org/apache/nutch/io/TestWritable.java
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java
lucene/nutch/trunk/src/test/org/apache/nutch/mapred/TestSequenceFileInputFormat.java
lucene/nutch/trunk/src/test/org/apache/nutch/mapred/TestTextInputFormat.java
lucene/nutch/trunk/src/test/org/apache/nutch/ndfs/TestNDFS.java
lucene/nutch/trunk/src/test/org/apache/nutch/net/TestBasicUrlNormalizer.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
lucene/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContentProperties.java
lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestQuery.java
lucene/nutch/trunk/src/web/jsp/anchors.jsp
lucene/nutch/trunk/src/web/jsp/cached.jsp
lucene/nutch/trunk/src/web/jsp/explain.jsp
lucene/nutch/trunk/src/web/jsp/refine-query-init.jsp
lucene/nutch/trunk/src/web/jsp/search.jsp
lucene/nutch/trunk/src/web/jsp/text.jsp
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Tue Jan 31 08:08:58 2006
@@ -26,6 +26,7 @@
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
/**
@@ -38,25 +39,21 @@
public final static Logger LOG =
LogFormatter.getLogger(AnalyzerFactory.class.getName());
- private final static ExtensionPoint X_POINT =
- PluginRepository.getInstance()
- .getExtensionPoint(NutchAnalyzer.X_POINT_ID);
- private final static Map CACHE = new HashMap();
-
- private final static NutchAnalyzer DEFAULT_ANALYZER =
- new NutchDocumentAnalyzer();
-
+ private NutchAnalyzer DEFAULT_ANALYZER;
- static {
- if (X_POINT == null) {
- throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
- " not found.");
- }
- }
-
+ private ExtensionPoint extensionPoint;
+ private NutchConf nutchConf;
- private AnalyzerFactory() {}
+ public AnalyzerFactory (NutchConf nutchConf) {
+ DEFAULT_ANALYZER = new NutchDocumentAnalyzer(nutchConf);
+ this.nutchConf = nutchConf;
+ this.extensionPoint = nutchConf.getPluginRepository().getExtensionPoint(NutchAnalyzer.X_POINT_ID);
+ if(this.extensionPoint == null) {
+ throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
+ " not found.");
+ }
+ }
/**
@@ -67,7 +64,7 @@
* plugin found whose "lang" attribute equals the specified lang parameter is
* used. If none match, then the {@link NutchDocumentAnalyzer} is used.
*/
- public static NutchAnalyzer get(String lang) {
+ public NutchAnalyzer get(String lang) {
NutchAnalyzer analyzer = DEFAULT_ANALYZER;
Extension extension = getExtension(lang);
@@ -81,20 +78,20 @@
return analyzer;
}
- private static Extension getExtension(String lang) {
+ private Extension getExtension(String lang) {
- Extension extension = (Extension) CACHE.get(lang);
+ Extension extension = (Extension) this.nutchConf.getObject(lang);
if (extension == null) {
extension = findExtension(lang);
- CACHE.put(lang, extension);
+ this.nutchConf.setObject(lang, extension);
}
return extension;
}
- private static Extension findExtension(String lang) {
+ private Extension findExtension(String lang) {
if (lang != null) {
- Extension[] extensions = X_POINT.getExtentens();
+ Extension[] extensions = this.extensionPoint.getExtensions();
for (int i=0; i<extensions.length; i++) {
if (lang.equals(extensions[i].getAttribute("lang"))) {
return extensions[i];
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Tue Jan 31 08:08:58 2006
@@ -37,11 +37,15 @@
private static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.analysis.CommonGrams");
private static final char SEPARATOR = '-';
- private static final HashMap COMMON_TERMS = new HashMap();
-
- static { init(); }
-
- private CommonGrams() {} // no public ctor
+ private HashMap COMMON_TERMS = new HashMap();
+
+ /**
+ * The constructor.
+ * @param nutchConf
+ */
+ public CommonGrams(NutchConf nutchConf) {
+ init(nutchConf);
+ }
private static class Filter extends TokenFilter {
private HashSet common;
@@ -130,10 +134,10 @@
}
/** Construct using the provided config file. */
- private static void init() {
+ private void init(NutchConf nutchConf) {
try {
- Reader reader = NutchConf.get().getConfResourceAsReader
- (NutchConf.get().get("analysis.common.terms.file"));
+ Reader reader = nutchConf.getConfResourceAsReader
+ (nutchConf.get("analysis.common.terms.file"));
BufferedReader in = new BufferedReader(reader);
String line;
while ((line = in.readLine()) != null) {
@@ -170,7 +174,7 @@
/** Construct a token filter that inserts n-grams for common terms. For use
* while indexing documents. */
- public static TokenFilter getFilter(TokenStream ts, String field) {
+ public TokenFilter getFilter(TokenStream ts, String field) {
return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
}
@@ -179,8 +183,10 @@
private Term[] terms;
private int index;
- public ArrayTokens(Phrase phrase) { this.terms = phrase.getTerms(); }
-
+ public ArrayTokens(Phrase phrase) {
+ this.terms = phrase.getTerms();
+ }
+
public Token next() {
if (index == terms.length)
return null;
@@ -190,7 +196,7 @@
}
/** Optimizes phrase queries to use n-grams when possible. */
- public static String[] optimizePhrase(Phrase phrase, String field) {
+ public String[] optimizePhrase(Phrase phrase, String field) {
//LOG.info("Optimizing " + phrase + " for " + field);
ArrayList result = new ArrayList();
TokenStream ts = getFilter(new ArrayTokens(phrase), field);
@@ -211,17 +217,10 @@
if (prev != null)
result.add(prev.termText());
-// LOG.info("Optimized: ");
-// for (int i = 0; i < result.size(); i++) {
-// LOG.info(result.get(i) + " ");
-// }
-
return (String[])result.toArray(new String[result.size()]);
-
-
}
- private static int arity(String gram) {
+ private int arity(String gram) {
int index = 0;
int arity = 0;
while ((index = gram.indexOf(SEPARATOR, index+1)) != -1) {
@@ -237,14 +236,14 @@
text.append(args[i]);
text.append(' ');
}
- TokenStream ts =
- new NutchDocumentTokenizer(new StringReader(text.toString()));
- ts = getFilter(ts, "url");
+ TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
+ CommonGrams commonGrams = new CommonGrams(new NutchConf());
+ ts = commonGrams.getFilter(ts, "url");
Token token;
while ((token = ts.next()) != null) {
System.out.println("Token: " + token);
}
- String[] optimized = optimizePhrase(new Phrase(args), "url");
+ String[] optimized = commonGrams.optimizePhrase(new Phrase(args), "url");
System.out.print("Optimized: ");
for (int i = 0; i < optimized.length; i++) {
System.out.print(optimized[i] + " ");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java Tue Jan 31 08:08:58 2006
@@ -4,7 +4,8 @@
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
-
+import org.apache.nutch.searcher.Query.Clause;
+import org.apache.nutch.util.NutchConf;
import org.apache.lucene.analysis.StopFilter;
import java.io.*;
@@ -24,6 +25,8 @@
private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);
private String queryString;
+ private QueryFilters queryFilters;
+
/** True iff word is a stop word. Stop words are only removed from queries.
* Every word is indexed. */
@@ -32,11 +35,12 @@
}
/** Construct a query parser for the text in a reader. */
- public static Query parseQuery(String queryString) throws IOException {
+ public static Query parseQuery(String queryString, NutchConf nutchConf) throws IOException {
NutchAnalysis parser =
new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
parser.queryString = queryString;
- return parser.parse();
+ parser.queryFilters = new QueryFilters(nutchConf);
+ return parser.parse(nutchConf);
}
/** For debugging. */
@@ -45,13 +49,13 @@
while (true) {
System.out.print("Query: ");
String line = in.readLine();
- System.out.println(parseQuery(line));
+ System.out.println(parseQuery(line, new NutchConf()));
}
}
/** Parse a query. */
- final public Query parse() throws ParseException {
- Query query = new Query();
+ final public Query parse(NutchConf nutchConf) throws ParseException {
+ Query query = new Query(nutchConf);
ArrayList terms;
Token token;
String field;
@@ -213,7 +217,7 @@
jj_consume_token(-1);
throw new ParseException();
}
- if (QueryFilters.isRawField(field)) {
+ if (this.queryFilters.isRawField(field)) {
result.clear();
result.add(queryString.substring(start, end));
}
@@ -259,7 +263,7 @@
term = term();
result.add(term);
}
- if (QueryFilters.isRawField(field)) {
+ if (this.queryFilters.isRawField(field)) {
result.clear();
result.add(queryString.substring(start, token.endColumn));
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.jj Tue Jan 31 08:08:58 2006
@@ -31,7 +31,8 @@
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
-
+import org.apache.nutch.searcher.Query.Clause;
+import org.apache.nutch.util.NutchConf;
import org.apache.lucene.analysis.StopFilter;
import java.io.*;
@@ -51,6 +52,8 @@
private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);
private String queryString;
+ private QueryFilters queryFilters;
+
/** True iff word is a stop word. Stop words are only removed from queries.
* Every word is indexed. */
@@ -59,11 +62,12 @@
}
/** Construct a query parser for the text in a reader. */
- public static Query parseQuery(String queryString) throws IOException {
+ public static Query parseQuery(String queryString, NutchConf nutchConf) throws IOException {
NutchAnalysis parser =
new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
parser.queryString = queryString;
- return parser.parse();
+ parser.queryFilters = new QueryFilters(nutchConf);
+ return parser.parse(nutchConf);
}
/** For debugging. */
@@ -72,7 +76,7 @@
while (true) {
System.out.print("Query: ");
String line = in.readLine();
- System.out.println(parseQuery(line));
+ System.out.println(parseQuery(line, new NutchConf()));
}
}
@@ -174,9 +178,9 @@
/** Parse a query. */
-Query parse() :
+Query parse(NutchConf nutchConf) :
{
- Query query = new Query();
+ Query query = new Query(nutchConf);
ArrayList terms;
Token token;
String field;
@@ -245,7 +249,7 @@
(<QUOTE>|<EOF>)
{
- if (QueryFilters.isRawField(field)) {
+ if (this.queryFilters.isRawField(field)) {
result.clear();
result.add(queryString.substring(start, end));
}
@@ -272,7 +276,7 @@
term = term() { result.add(term); })*
{
- if (QueryFilters.isRawField(field)) {
+ if (this.queryFilters.isRawField(field)) {
result.clear();
result.add(queryString.substring(start, token.endColumn));
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java Tue Jan 31 08:08:58 2006
@@ -3,6 +3,8 @@
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
+import org.apache.nutch.searcher.Query.Clause;
+import org.apache.nutch.util.NutchConf;
import org.apache.lucene.analysis.StopFilter;
import java.io.*;
import java.util.*;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Tue Jan 31 08:08:58 2006
@@ -24,40 +24,60 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
-
+import org.apache.nutch.mapred.JobConf;
+import org.apache.nutch.util.NutchConf;
/**
- * The analyzer used for Nutch documents.
- * Uses the JavaCC-defined lexical analyzer {@link NutchDocumentTokenizer},
- * with no stop list. This keeps it consistent with query parsing.
+ * The analyzer used for Nutch documents. Uses the JavaCC-defined lexical
+ * analyzer {@link NutchDocumentTokenizer}, with no stop list. This keeps it
+ * consistent with query parsing.
*/
public class NutchDocumentAnalyzer extends NutchAnalyzer {
/** Analyzer used to index textual content. */
- private static class ContentAnalyzer extends Analyzer {
- /** Constructs a {@link NutchDocumentTokenizer}. */
- public TokenStream tokenStream(String field, Reader reader) {
- return CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field);
- }
+ private static Analyzer CONTENT_ANALYZER;
+ // Anchor Analysis
+ // Like content analysis, but leave gap between anchors to inhibit
+ // cross-anchor phrase matching.
+ /**
+ * The number of unused term positions between anchors in the anchor field.
+ */
+ public static final int INTER_ANCHOR_GAP = 4;
+ /** Analyzer used to analyze anchors. */
+ private static Analyzer ANCHOR_ANALYZER;
+ private NutchConf nutchConf;
+
+ /**
+ * @param conf
+ */
+ public NutchDocumentAnalyzer(NutchConf conf) {
+ this.nutchConf = conf;
+ CONTENT_ANALYZER = new ContentAnalyzer(conf);
+ ANCHOR_ANALYZER = new AnchorAnalyzer();
}
/** Analyzer used to index textual content. */
- public static final Analyzer CONTENT_ANALYZER = new ContentAnalyzer();
+ private static class ContentAnalyzer extends Analyzer {
+ private CommonGrams commonGrams;
- // Anchor Analysis
- // Like content analysis, but leave gap between anchors to inhibit
- // cross-anchor phrase matching.
+ public ContentAnalyzer(NutchConf nutchConf) {
+ this.commonGrams = new CommonGrams(nutchConf);
+ }
- /** The number of unused term positions between anchors in the anchor
- * field. */
- public static final int INTER_ANCHOR_GAP = 4;
+ /** Constructs a {@link NutchDocumentTokenizer}. */
+ public TokenStream tokenStream(String field, Reader reader) {
+ return this.commonGrams.getFilter(new NutchDocumentTokenizer(reader),
+ field);
+ }
+ }
private static class AnchorFilter extends TokenFilter {
+ private boolean first = true;
+
public AnchorFilter(TokenStream input) {
super(input);
}
- private boolean first = true;
public final Token next() throws IOException {
Token result = input.next();
if (result == null)
@@ -76,9 +96,6 @@
}
}
- /** Analyzer used to analyze anchors. */
- public static final Analyzer ANCHOR_ANALYZER = new AnchorAnalyzer();
-
/** Returns a new token stream for text from the named field. */
public TokenStream tokenStream(String fieldName, Reader reader) {
Analyzer analyzer;
@@ -89,5 +106,4 @@
return analyzer.tokenStream(fieldName, reader);
}
-
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java Tue Jan 31 08:08:58 2006
@@ -30,11 +30,13 @@
public class OnlineClustererFactory {
public static final Logger LOG = LogFormatter
.getLogger(OnlineClustererFactory.class.getName());
+ private ExtensionPoint extensionPoint;
+ private String extensionName;
- private final static ExtensionPoint X_POINT = PluginRepository.getInstance()
- .getExtensionPoint(OnlineClusterer.X_POINT_ID);
-
- private OnlineClustererFactory() {}
+ public OnlineClustererFactory(NutchConf nutchConf) {
+ this.extensionPoint = nutchConf.getPluginRepository().getExtensionPoint(OnlineClusterer.X_POINT_ID);
+ this.extensionName = nutchConf.get("extension.clustering.extension-name");
+ }
/**
* @return Returns the online clustering extension specified
@@ -43,15 +45,14 @@
* empty (no preference), the first available clustering extension is
* returned.
*/
- public static OnlineClusterer getOnlineClusterer()
+ public OnlineClusterer getOnlineClusterer()
throws PluginRuntimeException {
- if (X_POINT == null) {
+ if (this.extensionPoint == null) {
// not even an extension point defined.
return null;
}
-
- String extensionName = NutchConf.get().get("extension.clustering.extension-name");
+
if (extensionName != null) {
Extension extension = findExtension(extensionName);
if (extension != null) {
@@ -63,7 +64,7 @@
// not found, fallback to the default, if available.
}
- Extension[] extensions = X_POINT.getExtensions();
+ Extension[] extensions = this.extensionPoint.getExtensions();
if (extensions.length > 0) {
LOG.info("Using the first clustering extension found: "
+ extensions[0].getId());
@@ -73,10 +74,10 @@
}
}
- private static Extension findExtension(String name)
+ private Extension findExtension(String name)
throws PluginRuntimeException {
- Extension[] extensions = X_POINT.getExtensions();
+ Extension[] extensions = this.extensionPoint.getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Tue Jan 31 08:08:58 2006
@@ -41,9 +41,6 @@
(new Date(System.currentTimeMillis()));
}
- static {
- NutchConf.get().addConfResource("crawl-tool.xml");
- }
/* Perform complete crawling and indexing given a set of root urls. */
public static void main(String args[]) throws Exception {
@@ -53,8 +50,9 @@
return;
}
- JobConf conf = new JobConf(NutchConf.get());
- //conf.addConfResource("crawl-tool.xml");
+ NutchConf nutchConf = new NutchConf();
+ nutchConf.addConfResource("crawl-tool.xml");
+ JobConf conf = new JobConf(nutchConf);
File rootUrlDir = null;
File dir = new File("crawl-" + getDate());
@@ -120,7 +118,7 @@
// index, dedup & merge
new Indexer(conf).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
new DeleteDuplicates(conf).dedup(new File[] { indexes });
- new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge();
+ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, nutchConf).merge();
LOG.info("crawl finished: " + dir);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Tue Jan 31 08:08:58 2006
@@ -89,7 +89,7 @@
}
public static void main(String[] args) throws Exception {
- CrawlDb crawlDb = new CrawlDb(NutchConf.get());
+ CrawlDb crawlDb = new CrawlDb(new NutchConf());
if (args.length < 2) {
System.err.println("Usage: <crawldb> <segment>");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Jan 31 08:08:58 2006
@@ -157,8 +157,8 @@
JobClient.runJob(job);
// reading the result
- NutchFileSystem fileSystem = NutchFileSystem.get();
- SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(fileSystem, tmpFolder);
+ NutchFileSystem fileSystem = NutchFileSystem.get(config);
+ SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
UTF8 key = new UTF8();
LongWritable value = new LongWritable();
@@ -210,7 +210,7 @@
NutchFileSystem fs = NutchFileSystem.get(config);
UTF8 key = new UTF8(url);
CrawlDatum val = new CrawlDatum();
- MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new File(crawlDb, CrawlDatum.DB_DIR_NAME));
+ MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new File(crawlDb, CrawlDatum.DB_DIR_NAME), config);
Writable res = MapFileOutputFormat.getEntry(readers, new HashPartitioner(), key, val);
System.out.println("URL: " + url);
if (res != null) {
@@ -254,7 +254,7 @@
}
String param = null;
String crawlDb = args[0];
- NutchConf conf = NutchConf.get();
+ NutchConf conf = new NutchConf();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-stats")) {
dbr.processStatJob(crawlDb, conf);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue Jan 31 08:08:58 2006
@@ -259,7 +259,7 @@
if (topN != Long.MAX_VALUE)
LOG.info("topN: " + topN);
- Generator gen = new Generator(NutchConf.get());
+ Generator gen = new Generator(new NutchConf());
gen.generate(dbDir, segmentsDir, numFetchers, topN, curTime);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Tue Jan 31 08:08:58 2006
@@ -35,11 +35,14 @@
/** Normalize and filter injected urls. */
public static class InjectMapper implements Mapper {
- private UrlNormalizer urlNormalizer = UrlNormalizerFactory.getNormalizer();
+ private UrlNormalizer urlNormalizer;
private float interval;
+ private JobConf jobConf;
public void configure(JobConf job) {
+ urlNormalizer = new UrlNormalizerFactory(job).getNormalizer();
interval = job.getFloat("db.default.fetch.interval", 30f);
+ this.jobConf = job;
}
public void map(WritableComparable key, Writable val,
@@ -47,9 +50,11 @@
throws IOException {
UTF8 value = (UTF8)val;
String url = value.toString(); // value is line of text
+ // System.out.println("url: " +url);
try {
url = urlNormalizer.normalize(url); // normalize the url
- url = URLFilters.filter(url); // filter the url
+ URLFilters filters = new URLFilters(this.jobConf);
+ url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warning("Skipping " +url+":"+e);
url = null;
@@ -116,7 +121,7 @@
}
public static void main(String[] args) throws Exception {
- Injector injector = new Injector(NutchConf.get());
+ Injector injector = new Injector(new NutchConf());
if (args.length < 2) {
System.err.println("Usage: Injector <crawldb> <url_dir>");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Tue Jan 31 08:08:58 2006
@@ -182,7 +182,7 @@
}
public static void main(String[] args) throws Exception {
- LinkDb linkDb = new LinkDb(NutchConf.get());
+ LinkDb linkDb = new LinkDb(new NutchConf());
if (args.length < 2) {
System.err.println("Usage: <linkdb> <segments>");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Tue Jan 31 08:08:58 2006
@@ -37,10 +37,12 @@
private NutchFileSystem fs;
private File directory;
private MapFile.Reader[] readers;
+ private NutchConf nutchConf;
- public LinkDbReader(NutchFileSystem fs, File directory) {
+ public LinkDbReader(NutchFileSystem fs, File directory, NutchConf nutchConf) {
this.fs = fs;
this.directory = directory;
+ this.nutchConf = nutchConf;
}
public String[] getAnchors(UTF8 url) throws IOException {
@@ -55,7 +57,7 @@
synchronized (this) {
if (readers == null) {
readers = MapFileOutputFormat.getReaders
- (fs, new File(directory, LinkDb.CURRENT_NAME));
+ (fs, new File(directory, LinkDb.CURRENT_NAME), this.nutchConf);
}
}
@@ -90,11 +92,11 @@
System.err.println("\t-url <url>\tprint information about <url> to System.out");
return;
}
-
+ NutchConf nutchConf = new NutchConf();
if (args[1].equals("-dump")) {
- LinkDbReader.processDumpJob(args[0], args[2], NutchConf.get());
+ LinkDbReader.processDumpJob(args[0], args[2], nutchConf);
} else if (args[1].equals("-url")) {
- LinkDbReader dbr = new LinkDbReader(NutchFileSystem.get(), new File(args[0]));
+ LinkDbReader dbr = new LinkDbReader(NutchFileSystem.get(new NutchConf()), new File(args[0]), nutchConf);
Inlinks links = dbr.getInlinks(new UTF8(args[2]));
if (links == null) {
System.out.println(" - no link information.");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java Tue Jan 31 08:08:58 2006
@@ -157,7 +157,7 @@
public static void main(String[] args) throws Exception {
TextProfileSignature sig = new TextProfileSignature();
- sig.setConf(NutchConf.get());
+ sig.setConf(new NutchConf());
HashMap res = new HashMap();
File[] files = new File(args[0]).listFiles();
for (int i = 0; i < files.length; i++) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Jan 31 08:08:58 2006
@@ -73,9 +73,18 @@
private boolean parsing;
private class FetcherThread extends Thread {
- public FetcherThread() {
+ private NutchConf nutchConf;
+ private URLFilters urlFilters;
+ private ParseUtil parseUtil;
+ private ProtocolFactory protocolFactory;
+
+ public FetcherThread(NutchConf nutchConf) {
this.setDaemon(true); // don't hang JVM on exit
this.setName("FetcherThread"); // use an informative name
+ this.nutchConf = nutchConf;
+ this.urlFilters = new URLFilters(nutchConf);
+ this.parseUtil = new ParseUtil(nutchConf);
+ this.protocolFactory = new ProtocolFactory(nutchConf);
}
public void run() {
@@ -112,7 +121,7 @@
do {
redirecting = false;
LOG.fine("redirectCount=" + redirectCount);
- Protocol protocol = ProtocolFactory.getProtocol(url);
+ Protocol protocol = this.protocolFactory.getProtocol(url);
ProtocolOutput output = protocol.getProtocolOutput(key, datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
@@ -127,7 +136,7 @@
case ProtocolStatus.MOVED: // redirect
case ProtocolStatus.TEMP_MOVED:
String newUrl = status.getMessage();
- newUrl = URLFilters.filter(newUrl);
+ newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url)) {
url = newUrl;
redirecting = true;
@@ -196,7 +205,7 @@
if (content == null) {
String url = key.toString();
- content = new Content(url, url, new byte[0], "", new ContentProperties());
+ content = new Content(url, url, new byte[0], "", new ContentProperties(), this.nutchConf);
}
content.getMetadata().setProperty // add segment to metadata
@@ -208,14 +217,14 @@
if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
ParseStatus parseStatus;
try {
- parse = ParseUtil.parse(content);
+ parse = this.parseUtil.parse(content);
parseStatus = parse.getData().getStatus();
} catch (Exception e) {
parseStatus = new ParseStatus(e);
}
if (!parseStatus.isSuccess()) {
LOG.warning("Error parsing: " + key + ": " + parseStatus);
- parse = parseStatus.getEmptyParse();
+ parse = parseStatus.getEmptyParse(getConf());
}
// Calculate page signature. For non-parsing fetchers this will
// be done in ParseSegment
@@ -295,7 +304,7 @@
LOG.info("Fetcher: threads: " + threadCount);
for (int i = 0; i < threadCount; i++) { // spawn threads
- new FetcherThread().start();
+ new FetcherThread(getConf()).start();
}
// select a timeout that avoids a task timeout
@@ -361,7 +370,7 @@
File segment = new File(args[0]);
- NutchConf conf = NutchConf.get();
+ NutchConf conf = new NutchConf();
int threads = conf.getInt("fetcher.threads.fetch", 10);
boolean parsing = true;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Tue Jan 31 08:08:58 2006
@@ -22,12 +22,15 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.parse.*;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
/* An entry in the fetcher's output. */
-public final class FetcherOutput implements Writable {
+public final class FetcherOutput implements Writable, NutchConfigurable {
private CrawlDatum crawlDatum;
private Content content;
private ParseImpl parse;
+ private NutchConf nutchConf;
public FetcherOutput() {}
@@ -41,7 +44,7 @@
public final void readFields(DataInput in) throws IOException {
this.crawlDatum = CrawlDatum.read(in);
this.content = in.readBoolean() ? Content.read(in) : null;
- this.parse = in.readBoolean() ? ParseImpl.read(in) : null;
+ this.parse = in.readBoolean() ? ParseImpl.read(in, this.nutchConf) : null;
}
public final void write(DataOutput out) throws IOException {
@@ -75,6 +78,14 @@
StringBuffer buffer = new StringBuffer();
buffer.append("CrawlDatum: " + crawlDatum+"\n" );
return buffer.toString();
+ }
+
+ public void setConf(NutchConf conf) {
+ this.nutchConf = conf;
+ }
+
+ public NutchConf getConf() {
+ return this.nutchConf;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/FileUtil.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/FileUtil.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/FileUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/FileUtil.java Tue Jan 31 08:08:58 2006
@@ -28,8 +28,8 @@
* Delete a directory and all its contents. If
* we return false, the directory may be partially-deleted.
*/
- public static boolean fullyDelete(File dir) throws IOException {
- return fullyDelete(new LocalFileSystem(), dir);
+ public static boolean fullyDelete(File dir, NutchConf nutchConf) throws IOException {
+ return fullyDelete(new LocalFileSystem(nutchConf), dir);
}
public static boolean fullyDelete(NutchFileSystem nfs, File dir) throws IOException {
// 20041022, xing.
@@ -43,7 +43,7 @@
* Copy a file's contents to a new location.
* Returns whether a target file was overwritten
*/
- public static boolean copyContents(NutchFileSystem nfs, File src, File dst, boolean overwrite) throws IOException {
+ public static boolean copyContents(NutchFileSystem nfs, File src, File dst, boolean overwrite, NutchConf nutchConf) throws IOException {
if (nfs.exists(dst) && !overwrite) {
return false;
}
@@ -57,7 +57,7 @@
NFSInputStream in = nfs.openRaw(src);
try {
NFSOutputStream out = nfs.createRaw(dst, true);
- byte buf[] = new byte[NutchConf.get().getInt("io.file.buffer.size", 4096)];
+ byte buf[] = new byte[nutchConf.getInt("io.file.buffer.size", 4096)];
try {
int readBytes = in.read(buf);
@@ -77,7 +77,7 @@
if (contents != null) {
for (int i = 0; i < contents.length; i++) {
File newDst = new File(dst, contents[i].getName());
- if (! copyContents(nfs, contents[i], newDst, overwrite)) {
+ if (! copyContents(nfs, contents[i], newDst, overwrite, nutchConf)) {
return false;
}
}
@@ -90,7 +90,7 @@
* Copy a file and/or directory and all its contents (whether
* data or other files/dirs)
*/
- public static void recursiveCopy(NutchFileSystem nfs, File src, File dst) throws IOException {
+ public static void recursiveCopy(NutchFileSystem nfs, File src, File dst, NutchConf nutchConf) throws IOException {
//
// Resolve the real target.
//
@@ -107,7 +107,7 @@
//
// If the source is a file, then just copy the contents
//
- copyContents(nfs, src, dst, true);
+ copyContents(nfs, src, dst, true, nutchConf);
} else {
//
// If the source is a dir, then we need to copy all the subfiles.
@@ -115,7 +115,7 @@
nfs.mkdirs(dst);
File contents[] = nfs.listFiles(src);
for (int i = 0; i < contents.length; i++) {
- recursiveCopy(nfs, contents[i], new File(dst, contents[i].getName()));
+ recursiveCopy(nfs, contents[i], new File(dst, contents[i].getName()), nutchConf);
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/LocalFileSystem.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/LocalFileSystem.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/LocalFileSystem.java Tue Jan 31 08:08:58 2006
@@ -23,6 +23,7 @@
import org.apache.nutch.ndfs.NDFSFile;
import org.apache.nutch.ndfs.DF;
import org.apache.nutch.ndfs.NDFSFileInfo;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.io.UTF8;
/****************************************************************
@@ -38,11 +39,11 @@
TreeMap lockObjSet = new TreeMap();
// by default use copy/delete instead of rename
boolean useCopyForRename = true;
-
+
/**
*/
- public LocalFileSystem() throws IOException {
- super();
+ public LocalFileSystem(NutchConf nutchConf) throws IOException {
+ super(nutchConf);
// if you find an OS which reliably supports non-POSIX
// rename(2) across filesystems / volumes, you can
// uncomment this.
@@ -175,7 +176,7 @@
*/
public boolean renameRaw(File src, File dst) throws IOException {
if (useCopyForRename) {
- FileUtil.copyContents(this, src, dst, true);
+ FileUtil.copyContents(this, src, dst, true, nutchConf);
return fullyDelete(src);
} else return src.renameTo(dst);
}
@@ -288,7 +289,7 @@
public void moveFromLocalFile(File src, File dst) throws IOException {
if (! src.equals(dst)) {
if (useCopyForRename) {
- FileUtil.copyContents(this, src, dst, true);
+ FileUtil.copyContents(this, src, dst, true, this.nutchConf);
fullyDelete(src);
} else src.renameTo(dst);
}
@@ -299,7 +300,7 @@
*/
public void copyFromLocalFile(File src, File dst) throws IOException {
if (! src.equals(dst)) {
- FileUtil.copyContents(this, src, dst, true);
+ FileUtil.copyContents(this, src, dst, true, this.nutchConf);
}
}
@@ -308,7 +309,7 @@
*/
public void copyToLocalFile(File src, File dst) throws IOException {
if (! src.equals(dst)) {
- FileUtil.copyContents(this, src, dst, true);
+ FileUtil.copyContents(this, src, dst, true, this.nutchConf);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSFileSystem.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSFileSystem.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSFileSystem.java Tue Jan 31 08:08:58 2006
@@ -43,8 +43,9 @@
* Create the ShareSet automatically, and then go on to
* the regular constructor.
*/
- public NDFSFileSystem(InetSocketAddress namenode) throws IOException {
- this.ndfs = new NDFSClient(namenode);
+ public NDFSFileSystem(InetSocketAddress namenode, NutchConf nutchConf) throws IOException {
+ super(nutchConf);
+ this.ndfs = new NDFSClient(namenode, nutchConf);
this.name = namenode.getHostName() + ":" + namenode.getPort();
}
@@ -172,7 +173,7 @@
doFromLocalFile(contents[i], new File(dst, contents[i].getName()), deleteSource);
}
} else {
- byte buf[] = new byte[NutchConf.get().getInt("io.file.buffer.size", 4096)];
+ byte buf[] = new byte[this.nutchConf.getInt("io.file.buffer.size", 4096)];
InputStream in = new BufferedInputStream(new FileInputStream(src));
try {
OutputStream out = create(dst);
@@ -217,10 +218,10 @@
copyToLocalFile(contents[i], new File(dst, contents[i].getName()));
}
} else {
- byte buf[] = new byte[NutchConf.get().getInt("io.file.buffer.size", 4096)];
+ byte buf[] = new byte[this.nutchConf.getInt("io.file.buffer.size", 4096)];
InputStream in = open(src);
try {
- OutputStream out = NutchFileSystem.getNamed("local").create(dst);
+ OutputStream out = NutchFileSystem.getNamed("local", this.nutchConf).create(dst);
try {
int bytesRead = in.read(buf);
while (bytesRead >= 0) {
@@ -267,7 +268,7 @@
*/
public void completeLocalInput(File localFile) throws IOException {
// Get rid of the local copy - we don't need it anymore.
- FileUtil.fullyDelete(localFile);
+ FileUtil.fullyDelete(localFile, this.nutchConf);
}
/**
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSShell.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSShell.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSShell.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NDFSShell.java Tue Jan 31 08:08:58 2006
@@ -122,8 +122,8 @@
/**
* Copy an NDFS file
*/
- public void copy(String srcf, String dstf) throws IOException {
- if (FileUtil.copyContents(nfs, new File(srcf), new File(dstf), true)) {
+ public void copy(String srcf, String dstf, NutchConf nutchConf) throws IOException {
+ if (FileUtil.copyContents(nfs, new File(srcf), new File(dstf), true, nutchConf)) {
System.out.println("Copied " + srcf + " to " + dstf);
} else {
System.out.println("Copy failed");
@@ -224,8 +224,9 @@
return;
}
+ NutchConf nutchConf = new NutchConf();
int i = 0;
- NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
+ NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i, nutchConf);
try {
NDFSShell tc = new NDFSShell(nfs);
@@ -244,7 +245,7 @@
} else if ("-mv".equals(cmd)) {
tc.rename(argv[i++], argv[i++]);
} else if ("-cp".equals(cmd)) {
- tc.copy(argv[i++], argv[i++]);
+ tc.copy(argv[i++], argv[i++], nutchConf);
} else if ("-rm".equals(cmd)) {
tc.delete(argv[i++]);
} else if ("-du".equals(cmd)) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataInputStream.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataInputStream.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataInputStream.java Tue Jan 31 08:08:58 2006
@@ -40,7 +40,7 @@
private Checksum sum = new CRC32();
private int inSum;
- public Checker(NutchFileSystem fs, File file)
+ public Checker(NutchFileSystem fs, File file, NutchConf nutchConf)
throws IOException {
super(fs.openRaw(file));
@@ -48,7 +48,7 @@
this.file = file;
File sumFile = fs.getChecksumFile(file);
try {
- this.sums = new NFSDataInputStream(fs.openRaw(sumFile));
+ this.sums = new NFSDataInputStream(fs.openRaw(sumFile), nutchConf);
byte[] version = new byte[VERSION.length];
sums.readFully(version);
if (!Arrays.equals(version, VERSION))
@@ -210,20 +210,25 @@
}
}
-
- public NFSDataInputStream(NutchFileSystem fs, File file) throws IOException {
- this(fs, file, NutchConf.get().getInt("io.file.buffer.size", 4096));
+
+
+ public NFSDataInputStream(NutchFileSystem fs, File file, int bufferSize, NutchConf nutchConf)
+ throws IOException {
+ super(null);
+ this.in = new Buffer(new PositionCache(new Checker(fs, file, nutchConf)), bufferSize);
}
-
- public NFSDataInputStream(NutchFileSystem fs, File file, int bufferSize)
+
+
+ public NFSDataInputStream(NutchFileSystem fs, File file, NutchConf nutchConf)
throws IOException {
super(null);
- this.in = new Buffer(new PositionCache(new Checker(fs, file)), bufferSize);
+ int bufferSize = nutchConf.getInt("io.file.buffer.size", 4096);
+ this.in = new Buffer(new PositionCache(new Checker(fs, file, nutchConf)), bufferSize);
}
/** Construct without checksums. */
- public NFSDataInputStream(NFSInputStream in) throws IOException {
- this(in, NutchConf.get().getInt("io.file.buffer.size", 4096));
+ public NFSDataInputStream(NFSInputStream in, NutchConf nutchConf) throws IOException {
+ this(in, nutchConf.getInt("io.file.buffer.size", 4096));
}
/** Construct without checksums. */
public NFSDataInputStream(NFSInputStream in, int bufferSize)
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataOutputStream.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataOutputStream.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataOutputStream.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NFSDataOutputStream.java Tue Jan 31 08:08:58 2006
@@ -29,29 +29,27 @@
/** Store checksums for data. */
private static class Summer extends FilterOutputStream {
- private final int bytesPerSum
- = NutchConf.get().getInt("io.bytes.per.checksum", 512);
-
private NFSDataOutputStream sums;
private Checksum sum = new CRC32();
private int inSum;
+ private int bytesPerSum;
- public Summer(NutchFileSystem fs, File file, boolean overwrite)
+ public Summer(NutchFileSystem fs, File file, boolean overwrite, NutchConf nutchConf)
throws IOException {
super(fs.createRaw(file, overwrite));
-
+ this.bytesPerSum = nutchConf.getInt("io.bytes.per.checksum", 512);
this.sums =
- new NFSDataOutputStream(fs.createRaw(fs.getChecksumFile(file), true));
+ new NFSDataOutputStream(fs.createRaw(fs.getChecksumFile(file), true), nutchConf);
sums.write(CHECKSUM_VERSION, 0, CHECKSUM_VERSION.length);
- sums.writeInt(bytesPerSum);
+ sums.writeInt(this.bytesPerSum);
}
public void write(byte b[], int off, int len) throws IOException {
int summed = 0;
while (summed < len) {
- int goal = bytesPerSum - inSum;
+ int goal = this.bytesPerSum - inSum;
int inBuf = len - summed;
int toSum = inBuf <= goal ? inBuf : goal;
@@ -59,7 +57,7 @@
summed += toSum;
inSum += toSum;
- if (inSum == bytesPerSum) {
+ if (inSum == this.bytesPerSum) {
writeSum();
}
}
@@ -124,15 +122,15 @@
}
public NFSDataOutputStream(NutchFileSystem fs, File file,
- boolean overwrite, int bufferSize)
+ boolean overwrite, NutchConf nutchConf)
throws IOException {
- super(new Buffer(new PositionCache(new Summer(fs, file, overwrite)),
- bufferSize));
+ super(new Buffer(new PositionCache(new Summer(fs, file, overwrite, nutchConf)),
+ nutchConf.getInt("io.file.buffer.size", 4096)));
}
/** Construct without checksums. */
- public NFSDataOutputStream(NFSOutputStream out) throws IOException {
- this(out, NutchConf.get().getInt("io.file.buffer.size", 4096));
+ public NFSDataOutputStream(NFSOutputStream out, NutchConf nutchConf) throws IOException {
+ this(out, nutchConf.getInt("io.file.buffer.size", 4096));
}
/** Construct without checksums. */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java Tue Jan 31 08:08:58 2006
@@ -45,7 +45,6 @@
public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.util.NutchFileSystem");
private static final HashMap NAME_TO_FS = new HashMap();
-
/**
* Parse the cmd-line args, starting at i. Remove consumed args
* from array. We expect param in the form:
@@ -53,7 +52,7 @@
*
* @deprecated use fs.default.name config option instead
*/
- public static NutchFileSystem parseArgs(String argv[], int i) throws IOException {
+ public static NutchFileSystem parseArgs(String argv[], int i, NutchConf nutchConf) throws IOException {
/**
if (argv.length - i < 1) {
throw new IOException("Must indicate filesystem type for NDFS");
@@ -65,12 +64,12 @@
if ("-ndfs".equals(cmd)) {
i++;
InetSocketAddress addr = DataNode.createSocketAddr(argv[i++]);
- nfs = new NDFSFileSystem(addr);
+ nfs = new NDFSFileSystem(addr, nutchConf);
} else if ("-local".equals(cmd)) {
i++;
- nfs = new LocalFileSystem();
+ nfs = new LocalFileSystem(nutchConf);
} else {
- nfs = get(); // using default
+ nfs = get(nutchConf); // using default
LOG.info("No FS indicated, using default:"+nfs.getName());
}
@@ -81,30 +80,26 @@
return nfs;
}
-
- /** Returns the default filesystem implementation.*/
- public static NutchFileSystem get() throws IOException {
- return get(NutchConf.get());
- }
-
/** Returns the configured filesystem implementation.*/
public static NutchFileSystem get(NutchConf conf) throws IOException {
- return getNamed(conf.get("fs.default.name", "local"));
+ return getNamed(conf.get("fs.default.name", "local"), conf);
}
+ protected NutchConf nutchConf;
/** Returns a name for this filesystem, suitable to pass to {@link
* NutchFileSystem#getNamed(String).*/
public abstract String getName();
/** Returns a named filesystem. Names are either the string "local" or a
* host:port pair, naming an NDFS name server.*/
- public static NutchFileSystem getNamed(String name) throws IOException {
+ public static NutchFileSystem getNamed(String name, NutchConf nutchConf) throws IOException {
NutchFileSystem fs = (NutchFileSystem)NAME_TO_FS.get(name);
+ int ioFileBufferSize = nutchConf.getInt("io.file.buffer.size", 4096);
if (fs == null) {
if ("local".equals(name)) {
- fs = new LocalFileSystem();
+ fs = new LocalFileSystem(nutchConf);
} else {
- fs = new NDFSFileSystem(DataNode.createSocketAddr(name));
+ fs = new NDFSFileSystem(DataNode.createSocketAddr(name), nutchConf);
}
NAME_TO_FS.put(name, fs);
}
@@ -127,7 +122,8 @@
///////////////////////////////////////////////////////////////
/**
*/
- public NutchFileSystem() {
+ public NutchFileSystem(NutchConf nutchConf) {
+ this.nutchConf = nutchConf;
}
/**
@@ -143,12 +139,16 @@
public abstract String[][] getFileCacheHints(File f, long start, long len) throws IOException;
/**
- * Opens an NFSDataInputStream for the indicated File.
+ * Opens an NFSDataInputStream at the indicated File.
+ * @param f the file name to open
+ * @param overwrite if a file with this name already exists, then if true,
+ * the file will be overwritten, and if false an error will be thrown.
+ * @param bufferSize the size of the buffer to be used.
*/
- public NFSDataInputStream open(File f) throws IOException {
- return open(f, NutchConf.get().getInt("io.file.buffer.size", 4096));
+ public NFSDataInputStream open(File f, int bufferSize) throws IOException {
+ return new NFSDataInputStream(this, f, bufferSize, this.nutchConf);
}
-
+
/**
* Opens an NFSDataInputStream at the indicated File.
* @param f the file name to open
@@ -156,8 +156,8 @@
* the file will be overwritten, and if false an error will be thrown.
* @param bufferSize the size of the buffer to be used.
*/
- public NFSDataInputStream open(File f, int bufferSize) throws IOException {
- return new NFSDataInputStream(this, f, bufferSize);
+ public NFSDataInputStream open(File f) throws IOException {
+ return new NFSDataInputStream(this, f, nutchConf);
}
/**
@@ -171,8 +171,7 @@
* Files are overwritten by default.
*/
public NFSDataOutputStream create(File f) throws IOException {
- return create(f, true,
- NutchConf.get().getInt("io.file.buffer.size", 4096));
+ return create(f, true,this.nutchConf.getInt("io.file.buffer.size", 4096));
}
/**
@@ -184,7 +183,7 @@
*/
public NFSDataOutputStream create(File f, boolean overwrite,
int bufferSize) throws IOException {
- return new NFSDataOutputStream(this, f, overwrite, bufferSize);
+ return new NFSDataOutputStream(this, f, overwrite, this.nutchConf);
}
/** Opens an OutputStream at the indicated File.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Tue Jan 31 08:08:58 2006
@@ -148,7 +148,7 @@
return new RecordReader() {
private IndexReader indexReader =
- IndexReader.open(new NdfsDirectory(fs, split.getFile(), false));
+ IndexReader.open(new NdfsDirectory(fs, split.getFile(), false, job));
{ indexReader.undeleteAll(); }
@@ -228,6 +228,7 @@
}
private NutchFileSystem fs;
+ private int ioFileBufferSize;
public DeleteDuplicates() { super(null); }
@@ -236,6 +237,7 @@
public void configure(JobConf job) {
try {
fs = NutchFileSystem.get(job);
+ this.ioFileBufferSize = job.getInt("io.file.buffer.size", 4096);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -254,7 +256,7 @@
OutputCollector output, Reporter reporter)
throws IOException {
File index = new File(key.toString());
- IndexReader reader = IndexReader.open(new NdfsDirectory(fs, index, false));
+ IndexReader reader = IndexReader.open(new NdfsDirectory(fs, index, false, getConf()));
try {
while (values.hasNext()) {
reader.delete(((IntWritable)values.next()).get());
@@ -316,6 +318,7 @@
job.setInputKeyClass(HashScore.class);
job.setInputValueClass(IndexDoc.class);
+ job.setInt("io.file.buffer.size", 4096);
job.setMapperClass(DeleteDuplicates.class);
job.setReducerClass(DeleteDuplicates.class);
@@ -331,7 +334,7 @@
}
public static void main(String[] args) throws Exception {
- DeleteDuplicates dedup = new DeleteDuplicates(NutchConf.get());
+ DeleteDuplicates dedup = new DeleteDuplicates(new NutchConf());
if (args.length < 1) {
System.err.println("Usage: <indexes> ...");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Tue Jan 31 08:08:58 2006
@@ -41,28 +41,21 @@
public static final String DONE_NAME = "merge.done";
- private int MERGE_FACTOR = NutchConf.get().getInt("indexer.mergeFactor",
- IndexWriter.DEFAULT_MERGE_FACTOR);
- private int MIN_MERGE_DOCS = NutchConf.get().getInt("indexer.minMergeDocs",
- IndexWriter.DEFAULT_MIN_MERGE_DOCS);
- private int MAX_MERGE_DOCS = NutchConf.get().getInt("indexer.maxMergeDocs",
- IndexWriter.DEFAULT_MAX_MERGE_DOCS);
- private int TERM_INDEX_INTERVAL =
- NutchConf.get().getInt("indexer.termIndexInterval",
- IndexWriter.DEFAULT_TERM_INDEX_INTERVAL);
private NutchFileSystem nfs;
private File outputIndex;
private File localWorkingDir;
private File[] indexes;
+ private NutchConf nutchConf;
/**
* Merge all of the indexes given
*/
- public IndexMerger(NutchFileSystem nfs, File[] indexes, File outputIndex, File localWorkingDir) throws IOException {
+ public IndexMerger(NutchFileSystem nfs, File[] indexes, File outputIndex, File localWorkingDir, NutchConf nutchConf) throws IOException {
this.nfs = nfs;
this.indexes = indexes;
this.outputIndex = outputIndex;
this.localWorkingDir = localWorkingDir;
+ this.nutchConf = nutchConf;
}
/**
@@ -81,7 +74,7 @@
Directory[] dirs = new Directory[indexes.length];
for (int i = 0; i < indexes.length; i++) {
LOG.info("Adding " + indexes[i]);
- dirs[i] = new NdfsDirectory(nfs, indexes[i], false);
+ dirs[i] = new NdfsDirectory(nfs, indexes[i], false, this.nutchConf);
}
//
@@ -90,10 +83,10 @@
// Merge indices
//
IndexWriter writer = new IndexWriter(localOutput, null, true);
- writer.mergeFactor = MERGE_FACTOR;
- writer.minMergeDocs = MIN_MERGE_DOCS;
- writer.maxMergeDocs = MAX_MERGE_DOCS;
- writer.setTermIndexInterval(TERM_INDEX_INTERVAL);
+ writer.mergeFactor = nutchConf.getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR);
+ writer.minMergeDocs = nutchConf.getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MIN_MERGE_DOCS);
+ writer.maxMergeDocs = nutchConf.getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS);
+ writer.setTermIndexInterval(nutchConf.getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
writer.infoStream = LogFormatter.getLogStream(LOG, Level.FINE);
writer.setUseCompoundFile(false);
writer.setSimilarity(new NutchSimilarity());
@@ -121,7 +114,8 @@
//
// Parse args, read all index directories to be processed
//
- NutchFileSystem nfs = NutchFileSystem.get();
+ NutchConf nutchConf = new NutchConf();
+ NutchFileSystem nfs = NutchFileSystem.get(nutchConf);
File workDir = new File(new File("").getCanonicalPath());
List indexDirs = new ArrayList();
@@ -146,12 +140,12 @@
File[] indexFiles = (File[])indexDirs.toArray(new File[indexDirs.size()]);
if (workDir.exists()) {
- FileUtil.fullyDelete(workDir);
+ FileUtil.fullyDelete(workDir, nutchConf);
}
workDir.mkdirs();
- IndexMerger merger = new IndexMerger(nfs,indexFiles,outputIndex,workDir);
+ IndexMerger merger = new IndexMerger(nfs,indexFiles,outputIndex,workDir, nutchConf);
merger.merge();
LOG.info("done merging");
- FileUtil.fullyDelete(workDir);
+ FileUtil.fullyDelete(workDir, nutchConf);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Tue Jan 31 08:08:58 2006
@@ -225,14 +225,14 @@
this.directory = directory;
}
- public void sort() throws IOException {
+ public void sort(int termIndexInterval) throws IOException {
IndexReader reader = IndexReader.open(new File(directory, "index"));
SortingReader sorter = new SortingReader(reader, oldToNew(reader));
IndexWriter writer = new IndexWriter(new File(directory, "index-sorted"),
null, true);
writer.setTermIndexInterval
- (NutchConf.get().getInt("indexer.termIndexInterval", 128));
+ (termIndexInterval);
writer.setUseCompoundFile(false);
writer.addIndexes(new IndexReader[] { sorter });
writer.close();
@@ -283,8 +283,9 @@
IndexSorter sorter = new IndexSorter(directory);
Date start = new Date();
-
- sorter.sort();
+ NutchConf nutchConf = new NutchConf();
+ int termIndexInterval = nutchConf.getInt("indexer.termIndexInterval", 128);
+ sorter.sort(termIndexInterval);
Date end = new Date();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Tue Jan 31 08:08:58 2006
@@ -55,7 +55,7 @@
reporter.setStatus(split.toString());
- return new SequenceFileRecordReader(fs, split) {
+ return new SequenceFileRecordReader(job, split) {
public synchronized boolean next(Writable key, Writable value)
throws IOException {
ObjectWritable wrapper = (ObjectWritable)value;
@@ -83,7 +83,7 @@
final IndexWriter writer = // build locally first
new IndexWriter(fs.startLocalOutput(perm, temp),
- new NutchDocumentAnalyzer(), true);
+ new NutchDocumentAnalyzer(job), true);
writer.mergeFactor = job.getInt("indexer.mergeFactor", 10);
writer.minMergeDocs = job.getInt("indexer.minMergeDocs", 100);
@@ -92,7 +92,7 @@
writer.setTermIndexInterval
(job.getInt("indexer.termIndexInterval", 128));
writer.maxFieldLength = job.getInt("indexer.max.tokens", 10000);
- //writer.infoStream = LogFormatter.getLogStream(LOG, Level.FINE);
+ writer.infoStream = LogFormatter.getLogStream(LOG, Level.INFO);
writer.setUseCompoundFile(false);
writer.setSimilarity(new NutchSimilarity());
@@ -133,6 +133,8 @@
}
}
+ private IndexingFilters filters;
+
public Indexer() {
super(null);
}
@@ -146,6 +148,8 @@
public void configure(JobConf job) {
scorePower = job.getFloat("indexer.score.power", 0.5f);
+ setConf(job);
+ this.filters = new IndexingFilters(getConf());
}
public void reduce(WritableComparable key, Iterator values,
@@ -217,7 +221,7 @@
try {
// run indexing filters
- doc = IndexingFilters.filter(doc,new ParseImpl(parseText, parseData), (UTF8)key, fetchDatum, inlinks);
+ doc = this.filters.filter(doc,new ParseImpl(parseText, parseData), (UTF8)key, fetchDatum, inlinks);
} catch (IndexingException e) {
LOG.warning("Error indexing "+key+": "+e);
return;
@@ -261,7 +265,7 @@
}
public static void main(String[] args) throws Exception {
- Indexer indexer = new Indexer(NutchConf.get());
+ Indexer indexer = new Indexer(new NutchConf());
if (args.length < 4) {
System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java Tue Jan 31 08:08:58 2006
@@ -18,6 +18,7 @@
import org.apache.lucene.document.Document;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConfigurable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.io.UTF8;
@@ -26,7 +27,7 @@
* fields. All plugins found which implement this extension point are run
* sequentially on the parse.
*/
-public interface IndexingFilter {
+public interface IndexingFilter extends NutchConfigurable {
/** The name of the extension point. */
final static String X_POINT_ID = IndexingFilter.class.getName();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=373853&r1=373852&r2=373853&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Tue Jan 31 08:08:58 2006
@@ -22,6 +22,7 @@
import org.apache.nutch.plugin.*;
import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConf;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.io.UTF8;
@@ -29,37 +30,38 @@
/** Creates and caches {@link IndexingFilter} implementing plugins.*/
public class IndexingFilters {
- private static final IndexingFilter[] CACHE;
- static {
- try {
- ExtensionPoint point = PluginRepository.getInstance()
- .getExtensionPoint(IndexingFilter.X_POINT_ID);
- if (point == null)
- throw new RuntimeException(IndexingFilter.X_POINT_ID+" not found.");
- Extension[] extensions = point.getExtensions();
- HashMap filterMap = new HashMap();
- for (int i = 0; i < extensions.length; i++) {
- Extension extension = extensions[i];
- IndexingFilter filter = (IndexingFilter)extension.getExtensionInstance();
- System.out.println("-adding " + filter.getClass().getName());
- if (!filterMap.containsKey(filter.getClass().getName())) {
- filterMap.put(filter.getClass().getName(), filter);
- }
- }
- CACHE = (IndexingFilter[])filterMap.values().toArray(new IndexingFilter[0]);
- } catch (PluginRuntimeException e) {
- throw new RuntimeException(e);
- }
- }
+ private IndexingFilter[] indexingFilters;
- private IndexingFilters() {} // no public ctor
+ public IndexingFilters(NutchConf nutchConf) {
+ this.indexingFilters =(IndexingFilter[]) nutchConf.getObject(IndexingFilter.class.getName());
+ if (this.indexingFilters == null) {
+ try {
+ ExtensionPoint point = nutchConf.getPluginRepository().getExtensionPoint(IndexingFilter.X_POINT_ID);
+ if (point == null)
+ throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found.");
+ Extension[] extensions = point.getExtensions();
+ HashMap filterMap = new HashMap();
+ for (int i = 0; i < extensions.length; i++) {
+ Extension extension = extensions[i];
+ IndexingFilter filter = (IndexingFilter) extension.getExtensionInstance();
+ System.out.println("-adding " + filter.getClass().getName());
+ if (!filterMap.containsKey(filter.getClass().getName())) {
+ filterMap.put(filter.getClass().getName(), filter);
+ }
+ }
+ nutchConf.setObject(IndexingFilter.class.getName(), (IndexingFilter[]) filterMap.values().toArray(new IndexingFilter[0]));
+ } catch (PluginRuntimeException e) {
+ throw new RuntimeException(e);
+ }
+ this.indexingFilters =(IndexingFilter[]) nutchConf.getObject(IndexingFilter.class.getName());
+ }
+ }
/** Run all defined filters. */
- public static Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
-
- for (int i = 0; i < CACHE.length; i++) {
- doc = CACHE[i].filter(doc, parse, url, datum, inlinks);
+ for (int i = 0; i < this.indexingFilters.length; i++) {
+ doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
}
return doc;