You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2017/02/25 14:34:52 UTC
[nutch] 01/03: Merge branch 'master' into NUTCH-2292-1
This is an automated email from the ASF dual-hosted git repository.
thammegowda pushed a commit to branch NUTCH-2292
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 98cd385b35bbd6b5b3a110b745f0eddd238b9456
Merge: 2175c76 3e2d3d4
Author: Thamme Gowda <th...@apache.org>
AuthorDate: Fri Feb 24 11:36:40 2017 -0800
Merge branch 'master' into NUTCH-2292-1
build.xml | 3 +
conf/httpclient-auth.xml.template | 6 +
conf/nutch-default.xml | 75 +++++++++
default.properties | 4 +-
ivy/ivy.xml | 3 +
ivy/mvn.template | 10 +-
.../apache/nutch/crawl/AbstractFetchSchedule.java | 4 +-
.../apache/nutch/crawl/AdaptiveFetchSchedule.java | 7 +-
.../java/org/apache/nutch/crawl/CrawlDatum.java | 8 +-
.../main/java/org/apache/nutch/crawl/CrawlDb.java | 25 ++-
.../java/org/apache/nutch/crawl/CrawlDbFilter.java | 4 +-
.../java/org/apache/nutch/crawl/CrawlDbMerger.java | 5 +-
.../java/org/apache/nutch/crawl/CrawlDbReader.java | 18 +-
.../org/apache/nutch/crawl/CrawlDbReducer.java | 7 +-
.../org/apache/nutch/crawl/DeduplicationJob.java | 7 +-
.../apache/nutch/crawl/DefaultFetchSchedule.java | 4 +
.../apache/nutch/crawl/FetchScheduleFactory.java | 6 +-
.../java/org/apache/nutch/crawl/Generator.java | 10 +-
.../main/java/org/apache/nutch/crawl/Injector.java | 39 +++--
.../main/java/org/apache/nutch/crawl/Inlinks.java | 8 +-
.../main/java/org/apache/nutch/crawl/LinkDb.java | 25 ++-
.../java/org/apache/nutch/crawl/LinkDbFilter.java | 4 +-
.../java/org/apache/nutch/crawl/LinkDbMerger.java | 6 +-
.../java/org/apache/nutch/crawl/LinkDbReader.java | 6 +-
.../nutch/crawl/MimeAdaptiveFetchSchedule.java | 7 +-
.../org/apache/nutch/crawl/SignatureFactory.java | 4 +-
.../apache/nutch/crawl/TextProfileSignature.java | 6 +-
.../org/apache/nutch/crawl/URLPartitioner.java | 3 +-
.../java/org/apache/nutch/fetcher/FetchItem.java | 4 +-
.../org/apache/nutch/fetcher/FetchItemQueue.java | 6 +-
.../org/apache/nutch/fetcher/FetchItemQueues.java | 8 +-
.../java/org/apache/nutch/fetcher/FetchNodeDb.java | 2 +-
.../java/org/apache/nutch/fetcher/Fetcher.java | 29 ++--
.../org/apache/nutch/fetcher/FetcherThread.java | 76 ++++++++-
.../java/org/apache/nutch/fetcher/QueueFeeder.java | 5 +-
.../java/org/apache/nutch/hostdb/ReadHostDb.java | 4 +-
.../org/apache/nutch/hostdb/ResolverThread.java | 4 +-
.../java/org/apache/nutch/hostdb/UpdateHostDb.java | 4 +-
.../apache/nutch/hostdb/UpdateHostDbMapper.java | 6 +-
.../apache/nutch/hostdb/UpdateHostDbReducer.java | 18 +-
.../java/org/apache/nutch/indexer/CleaningJob.java | 4 +-
.../org/apache/nutch/indexer/IndexWriters.java | 6 +-
.../org/apache/nutch/indexer/IndexerMapReduce.java | 5 +-
.../org/apache/nutch/indexer/IndexingFilters.java | 6 +-
.../nutch/indexer/IndexingFiltersChecker.java | 7 +-
.../java/org/apache/nutch/indexer/IndexingJob.java | 25 ++-
.../org/apache/nutch/indexer/NutchDocument.java | 2 +-
.../java/org/apache/nutch/indexer/NutchField.java | 4 +-
.../java/org/apache/nutch/metadata/Metadata.java | 2 +-
.../main/java/org/apache/nutch/metadata/Nutch.java | 13 ++
.../nutch/metadata/SpellCheckedMetadata.java | 2 +-
.../org/apache/nutch/net/URLExemptionFilters.java | 5 +-
.../java/org/apache/nutch/net/URLNormalizers.java | 13 +-
.../org/apache/nutch/parse/OutlinkExtractor.java | 5 +-
.../java/org/apache/nutch/parse/ParseData.java | 5 +-
.../org/apache/nutch/parse/ParseOutputFormat.java | 7 +-
.../org/apache/nutch/parse/ParsePluginList.java | 4 +-
.../org/apache/nutch/parse/ParsePluginsReader.java | 9 +-
.../java/org/apache/nutch/parse/ParseResult.java | 6 +-
.../java/org/apache/nutch/parse/ParseSegment.java | 30 ++--
.../java/org/apache/nutch/parse/ParseText.java | 5 +-
.../java/org/apache/nutch/parse/ParseUtil.java | 4 +-
.../java/org/apache/nutch/parse/ParserChecker.java | 6 +-
.../java/org/apache/nutch/parse/ParserFactory.java | 8 +-
.../java/org/apache/nutch/plugin/Extension.java | 2 +-
.../org/apache/nutch/plugin/ExtensionPoint.java | 2 +-
.../org/apache/nutch/plugin/PluginDescriptor.java | 21 +--
.../apache/nutch/plugin/PluginManifestParser.java | 2 +-
.../org/apache/nutch/plugin/PluginRepository.java | 35 ++--
.../java/org/apache/nutch/protocol/Content.java | 5 +-
.../java/org/apache/nutch/protocol/Protocol.java | 38 ++---
.../org/apache/nutch/protocol/ProtocolFactory.java | 5 +-
.../org/apache/nutch/protocol/ProtocolOutput.java | 14 ++
.../org/apache/nutch/protocol/ProtocolStatus.java | 2 +-
.../apache/nutch/protocol/RobotRulesParser.java | 183 ++++++++++++++++-----
.../apache/nutch/scoring/webgraph/LinkDumper.java | 10 +-
.../apache/nutch/scoring/webgraph/LinkRank.java | 10 +-
.../apache/nutch/scoring/webgraph/NodeDumper.java | 4 +-
.../apache/nutch/scoring/webgraph/NodeReader.java | 2 +-
.../nutch/scoring/webgraph/ScoreUpdater.java | 4 +-
.../apache/nutch/scoring/webgraph/WebGraph.java | 12 +-
.../nutch/segment/ContentAsTextInputFormat.java | 2 +-
.../org/apache/nutch/segment/SegmentChecker.java | 5 +-
.../apache/nutch/segment/SegmentMergeFilters.java | 3 +-
.../org/apache/nutch/segment/SegmentMerger.java | 13 +-
.../org/apache/nutch/segment/SegmentReader.java | 66 +++++---
.../java/org/apache/nutch/service/NutchReader.java | 6 +-
.../java/org/apache/nutch/service/NutchServer.java | 15 +-
.../org/apache/nutch/service/impl/JobWorker.java | 4 +-
.../org/apache/nutch/service/impl/LinkReader.java | 8 +-
.../org/apache/nutch/service/impl/NodeReader.java | 8 +-
.../apache/nutch/service/impl/SequenceReader.java | 12 +-
.../nutch/service/model/request/DbQuery.java | 2 +-
.../nutch/service/model/request/SeedList.java | 10 ++
.../service/model/response/FetchNodeDbInfo.java | 2 +-
.../nutch/service/resources/AdminResource.java | 3 +-
.../apache/nutch/service/resources/DbResource.java | 2 +-
.../nutch/service/resources/SeedResource.java | 105 ++++++------
.../nutch/tools/AbstractCommonCrawlFormat.java | 4 +-
.../java/org/apache/nutch/tools/Benchmark.java | 8 +-
.../apache/nutch/tools/CommonCrawlDataDumper.java | 7 +-
.../nutch/tools/CommonCrawlFormatJettinson.java | 4 +-
.../java/org/apache/nutch/tools/DmozParser.java | 26 +--
.../java/org/apache/nutch/tools/FileDumper.java | 40 ++---
.../java/org/apache/nutch/tools/FreeGenerator.java | 5 +-
.../java/org/apache/nutch/tools/ResolveUrls.java | 4 +-
.../apache/nutch/tools/arc/ArcRecordReader.java | 5 +-
.../apache/nutch/tools/arc/ArcSegmentCreator.java | 5 +-
.../org/apache/nutch/tools/warc/WARCExporter.java | 6 +-
.../apache/nutch/util/CrawlCompletionStats.java | 3 +-
.../java/org/apache/nutch/util/DeflateUtils.java | 4 +-
.../main/java/org/apache/nutch/util/DomUtil.java | 4 +-
.../java/org/apache/nutch/util/DumpFileUtil.java | 5 +-
.../org/apache/nutch/util/EncodingDetector.java | 15 +-
.../main/java/org/apache/nutch/util/GZIPUtils.java | 4 +-
.../java/org/apache/nutch/util/HadoopFSUtil.java | 19 +--
.../main/java/org/apache/nutch/util/JexlUtil.java | 4 +-
.../main/java/org/apache/nutch/util/MimeUtil.java | 10 +-
.../java/org/apache/nutch/util/NodeWalker.java | 2 +-
.../main/java/org/apache/nutch/util/NutchTool.java | 2 +-
.../java/org/apache/nutch/util/ObjectCache.java | 8 +-
.../nutch/util/ProtocolStatusStatistics.java | 3 +-
.../org/apache/nutch/util/TrieStringMatcher.java | 4 +-
.../apache/nutch/util/domain/DomainStatistics.java | 3 +-
.../apache/nutch/util/domain/DomainSuffixes.java | 5 +-
.../nutch/util/domain/DomainSuffixesReader.java | 3 +-
.../nutch/webui/client/impl/CrawlingCycle.java | 6 +-
.../webui/client/impl/RemoteCommandExecutor.java | 6 +-
.../webui/pages/components/ColorEnumLabel.java | 2 +-
.../pages/components/ColorEnumLabelBuilder.java | 2 +-
.../webui/pages/components/CpmIteratorAdapter.java | 2 +-
.../nutch/webui/pages/crawls/CrawlPanel.java | 8 +-
.../nutch/webui/pages/crawls/CrawlsPage.java | 4 +-
.../nutch/webui/pages/instances/InstancePanel.java | 2 +-
.../nutch/webui/pages/instances/InstancesPage.java | 4 +-
.../nutch/webui/pages/seed/SeedListsPage.java | 4 +-
.../apache/nutch/webui/pages/seed/SeedPage.java | 6 +-
.../nutch/webui/pages/settings/SettingsPage.java | 4 +-
.../nutch/webui/service/impl/CrawlServiceImpl.java | 6 +-
.../nutch/webui/service/impl/NutchServiceImpl.java | 7 +-
.../nutch/crawl/ContinuousCrawlTestUtil.java | 3 +-
.../org/apache/nutch/crawl/CrawlDBTestUtil.java | 3 +-
.../nutch/crawl/CrawlDbUpdateTestDriver.java | 3 +-
.../org/apache/nutch/crawl/CrawlDbUpdateUtil.java | 3 +-
.../apache/nutch/crawl/TODOTestCrawlDbStates.java | 4 +-
.../org/apache/nutch/crawl/TestCrawlDbMerger.java | 18 +-
.../org/apache/nutch/crawl/TestCrawlDbStates.java | 18 +-
.../org/apache/nutch/crawl/TestLinkDbMerger.java | 18 +-
.../apache/nutch/indexer/TestIndexerMapReduce.java | 4 +-
.../segment/TestSegmentMergerCrawlDatums.java | 3 +-
.../org/apache/nutch/service/TestNutchServer.java | 4 +-
.../apache/nutch/tools/proxy/LogDebugHandler.java | 3 +-
.../org/apache/nutch/tools/proxy/ProxyTestbed.java | 4 +-
.../apache/nutch/tools/proxy/SegmentHandler.java | 3 +-
nutch-plugins/build.xml | 1 +
.../creativecommons/nutch/CCIndexingFilter.java | 5 +-
.../org/creativecommons/nutch/CCParseFilter.java | 4 +-
.../org/apache/nutch/parse/feed/FeedParser.java | 4 +-
.../apache/nutch/parse/feed/TestFeedParser.java | 5 +-
.../nutch/indexer/anchor/AnchorIndexingFilter.java | 5 +-
.../nutch/indexer/basic/BasicIndexingFilter.java | 5 +-
.../nutch/indexer/geoip/GeoIPIndexingFilter.java | 3 +-
.../nutch/indexer/links/LinksIndexingFilter.java | 7 +-
.../nutch/indexer/more/MoreIndexingFilter.java | 5 +-
.../cloudsearch/CloudSearchIndexWriter.java | 5 +-
.../nutch/indexwriter/dummy/DummyIndexWriter.java | 5 +-
.../indexwriter/elastic/ElasticIndexWriter.java | 4 +-
.../nutch/indexwriter/solr/SolrIndexWriter.java | 5 +-
.../nutch/indexwriter/solr/SolrMappingReader.java | 4 +-
.../apache/nutch/indexwriter/solr/SolrUtils.java | 14 +-
.../nutch/analysis/lang/HTMLLanguageParser.java | 5 +-
.../nutch/protocol/htmlunit/HtmlUnitWebDriver.java | 4 +-
.../apache/nutch/protocol/http/api/HttpBase.java | 71 ++++----
.../protocol/http/api/HttpRobotRulesParser.java | 57 ++++++-
.../nutch/urlfilter/api/RegexURLFilterBase.java | 5 +-
.../urlfilter/api/RegexURLFilterBaseTest.java | 5 +-
.../nutch/protocol/selenium/HttpWebClient.java | 4 +-
.../nutch/microformats/reltag/RelTagParser.java | 4 +-
.../indexer/filter/MimeTypeIndexingFilter.java | 3 +-
nutch-plugins/nutch-extensionpoints/plugin.xml | 4 +
.../java/org/apache/nutch/parse/ext/ExtParser.java | 5 +-
.../org/apache/nutch/parse/html/HtmlParser.java | 5 +-
.../apache/nutch/parse/html/TestHtmlParser.java | 5 +-
.../org/apache/nutch/parse/js/JSParseFilter.java | 4 +-
.../java/org/apache/nutch/parse/swf/SWFParser.java | 5 +-
.../org/apache/nutch/parse/tika/TikaParser.java | 4 +-
.../java/org/apache/nutch/tika/TestFeedParser.java | 6 +-
.../java/org/apache/nutch/parse/zip/ZipParser.java | 4 +-
.../apache/nutch/parse/zip/ZipTextExtractor.java | 5 +-
.../naivebayes/NaiveBayesParseFilter.java | 3 +-
.../nutch/parsefilter/regex/RegexParseFilter.java | 24 ++-
.../java/org/apache/nutch/protocol/file/File.java | 17 +-
.../java/org/apache/nutch/protocol/ftp/Ftp.java | 13 +-
.../nutch/protocol/ftp/FtpRobotRulesParser.java | 22 ++-
.../org/apache/nutch/protocol/htmlunit/Http.java | 4 +-
.../java/org/apache/nutch/protocol/http/Http.java | 4 +-
.../apache/nutch/protocol/http/HttpResponse.java | 11 +-
.../httpclient/DummySSLProtocolSocketFactory.java | 3 +-
.../protocol/httpclient/DummyX509TrustManager.java | 3 +-
.../org/apache/nutch/protocol/httpclient/Http.java | 83 ++++++----
.../httpclient/HttpAuthenticationFactory.java | 5 +-
.../httpclient/HttpBasicAuthentication.java | 5 +-
.../httpclient/HttpFormAuthConfigurer.java | 21 ++-
.../httpclient/HttpFormAuthentication.java | 85 +++++++---
.../nutch/protocol/httpclient/HttpResponse.java | 7 +
.../nutch/protocol/interactiveselenium/Http.java | 4 +-
.../handlers/DefalultMultiInteractionHandler.java | 4 +-
.../handlers/DefaultClickAllAjaxLinksHandler.java | 3 +-
.../org/apache/nutch/protocol/selenium/Http.java | 4 +-
.../nutch/scoring/opic/OPICScoringFilter.java | 5 +-
.../similarity/cosine/CosineSimilarity.java | 7 +-
.../nutch/scoring/similarity/cosine/Model.java | 6 +-
.../apache/nutch/collection/CollectionManager.java | 4 +-
.../subcollection/SubcollectionIndexingFilter.java | 6 +-
.../nutch/indexer/tld/TLDIndexingFilter.java | 5 +-
.../nutch/urlfilter/domain/DomainURLFilter.java | 3 +-
.../domainblacklist/DomainBlacklistURLFilter.java | 3 +-
.../urlfilter/ignoreexempt/ExemptionUrlFilter.java | 5 +-
.../nutch/urlfilter/prefix/PrefixURLFilter.java | 3 +-
.../nutch/urlfilter/suffix/SuffixURLFilter.java | 3 +-
.../indexer/urlmeta/URLMetaIndexingFilter.java | 4 +-
.../scoring/urlmeta/URLMetaScoringFilter.java | 3 +-
.../net/urlnormalizer/ajax/AjaxURLNormalizer.java | 4 +-
.../urlnormalizer/basic/BasicURLNormalizer.java | 21 ++-
.../basic/TestBasicURLNormalizer.java | 11 +-
.../net/urlnormalizer/host/HostURLNormalizer.java | 3 +-
.../protocol/ProtocolURLNormalizer.java | 4 +-
.../querystring/QuerystringURLNormalizer.java | 3 +-
.../urlnormalizer/regex/RegexURLNormalizer.java | 3 +-
.../regex/TestRegexURLNormalizer.java | 3 +-
.../urlnormalizer/slash/SlashURLNormalizer.java | 4 +-
.../apache/nutch/fetcher/FetcherThreadEvent.java | 147 +++++++++++++++++
.../nutch/fetcher/FetcherThreadPublisher.java | 61 +++++++
.../org/apache/nutch/publisher/NutchPublisher.java | 47 +++---
.../apache/nutch/publisher/NutchPublishers.java | 83 ++++++++++
.../java/org/apache/nutch/service/SeedManager.java | 34 ++--
.../apache/nutch/service/impl/SeedManagerImpl.java | 60 +++----
src/plugin/parsefilter-regex/README.txt | 41 +++++
src/plugin/publish-rabbitmq/build-ivy.xml | 54 ++++++
src/plugin/publish-rabbitmq/build.xml | 27 +++
src/plugin/publish-rabbitmq/ivy.xml | 42 +++++
src/plugin/publish-rabbitmq/plugin.xml | 43 +++++
.../publisher/rabbitmq/RabbitMQPublisherImpl.java | 95 +++++++++++
.../nutch/publisher/rabbitmq/package-info.java | 25 +--
244 files changed, 2202 insertions(+), 927 deletions(-)
diff --cc nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
index 599c353,7c4b2eb..bfb1581
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@@ -36,11 -37,10 +38,11 @@@ import org.junit.After
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
+import org.junit.experimental.categories.Category;
public class TestCrawlDbMerger {
- private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class
- .getName());
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
String url10 = "http://example.com/";
String url11 = "http://example.com/foo";
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.