You are viewing a plain text version of this content. The hyperlink to the canonical version was not preserved in this plain-text copy.
Posted to commits@nutch.apache.org by th...@apache.org on 2017/02/25 14:34:52 UTC

[nutch] 01/03: Merge branch 'master' into NUTCH-2292-1

This is an automated email from the ASF dual-hosted git repository.

thammegowda pushed a commit to branch NUTCH-2292
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 98cd385b35bbd6b5b3a110b745f0eddd238b9456
Merge: 2175c76 3e2d3d4
Author: Thamme Gowda <th...@apache.org>
AuthorDate: Fri Feb 24 11:36:40 2017 -0800

    Merge branch 'master' into NUTCH-2292-1

 build.xml                                          |   3 +
 conf/httpclient-auth.xml.template                  |   6 +
 conf/nutch-default.xml                             |  75 +++++++++
 default.properties                                 |   4 +-
 ivy/ivy.xml                                        |   3 +
 ivy/mvn.template                                   |  10 +-
 .../apache/nutch/crawl/AbstractFetchSchedule.java  |   4 +-
 .../apache/nutch/crawl/AdaptiveFetchSchedule.java  |   7 +-
 .../java/org/apache/nutch/crawl/CrawlDatum.java    |   8 +-
 .../main/java/org/apache/nutch/crawl/CrawlDb.java  |  25 ++-
 .../java/org/apache/nutch/crawl/CrawlDbFilter.java |   4 +-
 .../java/org/apache/nutch/crawl/CrawlDbMerger.java |   5 +-
 .../java/org/apache/nutch/crawl/CrawlDbReader.java |  18 +-
 .../org/apache/nutch/crawl/CrawlDbReducer.java     |   7 +-
 .../org/apache/nutch/crawl/DeduplicationJob.java   |   7 +-
 .../apache/nutch/crawl/DefaultFetchSchedule.java   |   4 +
 .../apache/nutch/crawl/FetchScheduleFactory.java   |   6 +-
 .../java/org/apache/nutch/crawl/Generator.java     |  10 +-
 .../main/java/org/apache/nutch/crawl/Injector.java |  39 +++--
 .../main/java/org/apache/nutch/crawl/Inlinks.java  |   8 +-
 .../main/java/org/apache/nutch/crawl/LinkDb.java   |  25 ++-
 .../java/org/apache/nutch/crawl/LinkDbFilter.java  |   4 +-
 .../java/org/apache/nutch/crawl/LinkDbMerger.java  |   6 +-
 .../java/org/apache/nutch/crawl/LinkDbReader.java  |   6 +-
 .../nutch/crawl/MimeAdaptiveFetchSchedule.java     |   7 +-
 .../org/apache/nutch/crawl/SignatureFactory.java   |   4 +-
 .../apache/nutch/crawl/TextProfileSignature.java   |   6 +-
 .../org/apache/nutch/crawl/URLPartitioner.java     |   3 +-
 .../java/org/apache/nutch/fetcher/FetchItem.java   |   4 +-
 .../org/apache/nutch/fetcher/FetchItemQueue.java   |   6 +-
 .../org/apache/nutch/fetcher/FetchItemQueues.java  |   8 +-
 .../java/org/apache/nutch/fetcher/FetchNodeDb.java |   2 +-
 .../java/org/apache/nutch/fetcher/Fetcher.java     |  29 ++--
 .../org/apache/nutch/fetcher/FetcherThread.java    |  76 ++++++++-
 .../java/org/apache/nutch/fetcher/QueueFeeder.java |   5 +-
 .../java/org/apache/nutch/hostdb/ReadHostDb.java   |   4 +-
 .../org/apache/nutch/hostdb/ResolverThread.java    |   4 +-
 .../java/org/apache/nutch/hostdb/UpdateHostDb.java |   4 +-
 .../apache/nutch/hostdb/UpdateHostDbMapper.java    |   6 +-
 .../apache/nutch/hostdb/UpdateHostDbReducer.java   |  18 +-
 .../java/org/apache/nutch/indexer/CleaningJob.java |   4 +-
 .../org/apache/nutch/indexer/IndexWriters.java     |   6 +-
 .../org/apache/nutch/indexer/IndexerMapReduce.java |   5 +-
 .../org/apache/nutch/indexer/IndexingFilters.java  |   6 +-
 .../nutch/indexer/IndexingFiltersChecker.java      |   7 +-
 .../java/org/apache/nutch/indexer/IndexingJob.java |  25 ++-
 .../org/apache/nutch/indexer/NutchDocument.java    |   2 +-
 .../java/org/apache/nutch/indexer/NutchField.java  |   4 +-
 .../java/org/apache/nutch/metadata/Metadata.java   |   2 +-
 .../main/java/org/apache/nutch/metadata/Nutch.java |  13 ++
 .../nutch/metadata/SpellCheckedMetadata.java       |   2 +-
 .../org/apache/nutch/net/URLExemptionFilters.java  |   5 +-
 .../java/org/apache/nutch/net/URLNormalizers.java  |  13 +-
 .../org/apache/nutch/parse/OutlinkExtractor.java   |   5 +-
 .../java/org/apache/nutch/parse/ParseData.java     |   5 +-
 .../org/apache/nutch/parse/ParseOutputFormat.java  |   7 +-
 .../org/apache/nutch/parse/ParsePluginList.java    |   4 +-
 .../org/apache/nutch/parse/ParsePluginsReader.java |   9 +-
 .../java/org/apache/nutch/parse/ParseResult.java   |   6 +-
 .../java/org/apache/nutch/parse/ParseSegment.java  |  30 ++--
 .../java/org/apache/nutch/parse/ParseText.java     |   5 +-
 .../java/org/apache/nutch/parse/ParseUtil.java     |   4 +-
 .../java/org/apache/nutch/parse/ParserChecker.java |   6 +-
 .../java/org/apache/nutch/parse/ParserFactory.java |   8 +-
 .../java/org/apache/nutch/plugin/Extension.java    |   2 +-
 .../org/apache/nutch/plugin/ExtensionPoint.java    |   2 +-
 .../org/apache/nutch/plugin/PluginDescriptor.java  |  21 +--
 .../apache/nutch/plugin/PluginManifestParser.java  |   2 +-
 .../org/apache/nutch/plugin/PluginRepository.java  |  35 ++--
 .../java/org/apache/nutch/protocol/Content.java    |   5 +-
 .../java/org/apache/nutch/protocol/Protocol.java   |  38 ++---
 .../org/apache/nutch/protocol/ProtocolFactory.java |   5 +-
 .../org/apache/nutch/protocol/ProtocolOutput.java  |  14 ++
 .../org/apache/nutch/protocol/ProtocolStatus.java  |   2 +-
 .../apache/nutch/protocol/RobotRulesParser.java    | 183 ++++++++++++++++-----
 .../apache/nutch/scoring/webgraph/LinkDumper.java  |  10 +-
 .../apache/nutch/scoring/webgraph/LinkRank.java    |  10 +-
 .../apache/nutch/scoring/webgraph/NodeDumper.java  |   4 +-
 .../apache/nutch/scoring/webgraph/NodeReader.java  |   2 +-
 .../nutch/scoring/webgraph/ScoreUpdater.java       |   4 +-
 .../apache/nutch/scoring/webgraph/WebGraph.java    |  12 +-
 .../nutch/segment/ContentAsTextInputFormat.java    |   2 +-
 .../org/apache/nutch/segment/SegmentChecker.java   |   5 +-
 .../apache/nutch/segment/SegmentMergeFilters.java  |   3 +-
 .../org/apache/nutch/segment/SegmentMerger.java    |  13 +-
 .../org/apache/nutch/segment/SegmentReader.java    |  66 +++++---
 .../java/org/apache/nutch/service/NutchReader.java |   6 +-
 .../java/org/apache/nutch/service/NutchServer.java |  15 +-
 .../org/apache/nutch/service/impl/JobWorker.java   |   4 +-
 .../org/apache/nutch/service/impl/LinkReader.java  |   8 +-
 .../org/apache/nutch/service/impl/NodeReader.java  |   8 +-
 .../apache/nutch/service/impl/SequenceReader.java  |  12 +-
 .../nutch/service/model/request/DbQuery.java       |   2 +-
 .../nutch/service/model/request/SeedList.java      |  10 ++
 .../service/model/response/FetchNodeDbInfo.java    |   2 +-
 .../nutch/service/resources/AdminResource.java     |   3 +-
 .../apache/nutch/service/resources/DbResource.java |   2 +-
 .../nutch/service/resources/SeedResource.java      | 105 ++++++------
 .../nutch/tools/AbstractCommonCrawlFormat.java     |   4 +-
 .../java/org/apache/nutch/tools/Benchmark.java     |   8 +-
 .../apache/nutch/tools/CommonCrawlDataDumper.java  |   7 +-
 .../nutch/tools/CommonCrawlFormatJettinson.java    |   4 +-
 .../java/org/apache/nutch/tools/DmozParser.java    |  26 +--
 .../java/org/apache/nutch/tools/FileDumper.java    |  40 ++---
 .../java/org/apache/nutch/tools/FreeGenerator.java |   5 +-
 .../java/org/apache/nutch/tools/ResolveUrls.java   |   4 +-
 .../apache/nutch/tools/arc/ArcRecordReader.java    |   5 +-
 .../apache/nutch/tools/arc/ArcSegmentCreator.java  |   5 +-
 .../org/apache/nutch/tools/warc/WARCExporter.java  |   6 +-
 .../apache/nutch/util/CrawlCompletionStats.java    |   3 +-
 .../java/org/apache/nutch/util/DeflateUtils.java   |   4 +-
 .../main/java/org/apache/nutch/util/DomUtil.java   |   4 +-
 .../java/org/apache/nutch/util/DumpFileUtil.java   |   5 +-
 .../org/apache/nutch/util/EncodingDetector.java    |  15 +-
 .../main/java/org/apache/nutch/util/GZIPUtils.java |   4 +-
 .../java/org/apache/nutch/util/HadoopFSUtil.java   |  19 +--
 .../main/java/org/apache/nutch/util/JexlUtil.java  |   4 +-
 .../main/java/org/apache/nutch/util/MimeUtil.java  |  10 +-
 .../java/org/apache/nutch/util/NodeWalker.java     |   2 +-
 .../main/java/org/apache/nutch/util/NutchTool.java |   2 +-
 .../java/org/apache/nutch/util/ObjectCache.java    |   8 +-
 .../nutch/util/ProtocolStatusStatistics.java       |   3 +-
 .../org/apache/nutch/util/TrieStringMatcher.java   |   4 +-
 .../apache/nutch/util/domain/DomainStatistics.java |   3 +-
 .../apache/nutch/util/domain/DomainSuffixes.java   |   5 +-
 .../nutch/util/domain/DomainSuffixesReader.java    |   3 +-
 .../nutch/webui/client/impl/CrawlingCycle.java     |   6 +-
 .../webui/client/impl/RemoteCommandExecutor.java   |   6 +-
 .../webui/pages/components/ColorEnumLabel.java     |   2 +-
 .../pages/components/ColorEnumLabelBuilder.java    |   2 +-
 .../webui/pages/components/CpmIteratorAdapter.java |   2 +-
 .../nutch/webui/pages/crawls/CrawlPanel.java       |   8 +-
 .../nutch/webui/pages/crawls/CrawlsPage.java       |   4 +-
 .../nutch/webui/pages/instances/InstancePanel.java |   2 +-
 .../nutch/webui/pages/instances/InstancesPage.java |   4 +-
 .../nutch/webui/pages/seed/SeedListsPage.java      |   4 +-
 .../apache/nutch/webui/pages/seed/SeedPage.java    |   6 +-
 .../nutch/webui/pages/settings/SettingsPage.java   |   4 +-
 .../nutch/webui/service/impl/CrawlServiceImpl.java |   6 +-
 .../nutch/webui/service/impl/NutchServiceImpl.java |   7 +-
 .../nutch/crawl/ContinuousCrawlTestUtil.java       |   3 +-
 .../org/apache/nutch/crawl/CrawlDBTestUtil.java    |   3 +-
 .../nutch/crawl/CrawlDbUpdateTestDriver.java       |   3 +-
 .../org/apache/nutch/crawl/CrawlDbUpdateUtil.java  |   3 +-
 .../apache/nutch/crawl/TODOTestCrawlDbStates.java  |   4 +-
 .../org/apache/nutch/crawl/TestCrawlDbMerger.java  |  18 +-
 .../org/apache/nutch/crawl/TestCrawlDbStates.java  |  18 +-
 .../org/apache/nutch/crawl/TestLinkDbMerger.java   |  18 +-
 .../apache/nutch/indexer/TestIndexerMapReduce.java |   4 +-
 .../segment/TestSegmentMergerCrawlDatums.java      |   3 +-
 .../org/apache/nutch/service/TestNutchServer.java  |   4 +-
 .../apache/nutch/tools/proxy/LogDebugHandler.java  |   3 +-
 .../org/apache/nutch/tools/proxy/ProxyTestbed.java |   4 +-
 .../apache/nutch/tools/proxy/SegmentHandler.java   |   3 +-
 nutch-plugins/build.xml                            |   1 +
 .../creativecommons/nutch/CCIndexingFilter.java    |   5 +-
 .../org/creativecommons/nutch/CCParseFilter.java   |   4 +-
 .../org/apache/nutch/parse/feed/FeedParser.java    |   4 +-
 .../apache/nutch/parse/feed/TestFeedParser.java    |   5 +-
 .../nutch/indexer/anchor/AnchorIndexingFilter.java |   5 +-
 .../nutch/indexer/basic/BasicIndexingFilter.java   |   5 +-
 .../nutch/indexer/geoip/GeoIPIndexingFilter.java   |   3 +-
 .../nutch/indexer/links/LinksIndexingFilter.java   |   7 +-
 .../nutch/indexer/more/MoreIndexingFilter.java     |   5 +-
 .../cloudsearch/CloudSearchIndexWriter.java        |   5 +-
 .../nutch/indexwriter/dummy/DummyIndexWriter.java  |   5 +-
 .../indexwriter/elastic/ElasticIndexWriter.java    |   4 +-
 .../nutch/indexwriter/solr/SolrIndexWriter.java    |   5 +-
 .../nutch/indexwriter/solr/SolrMappingReader.java  |   4 +-
 .../apache/nutch/indexwriter/solr/SolrUtils.java   |  14 +-
 .../nutch/analysis/lang/HTMLLanguageParser.java    |   5 +-
 .../nutch/protocol/htmlunit/HtmlUnitWebDriver.java |   4 +-
 .../apache/nutch/protocol/http/api/HttpBase.java   |  71 ++++----
 .../protocol/http/api/HttpRobotRulesParser.java    |  57 ++++++-
 .../nutch/urlfilter/api/RegexURLFilterBase.java    |   5 +-
 .../urlfilter/api/RegexURLFilterBaseTest.java      |   5 +-
 .../nutch/protocol/selenium/HttpWebClient.java     |   4 +-
 .../nutch/microformats/reltag/RelTagParser.java    |   4 +-
 .../indexer/filter/MimeTypeIndexingFilter.java     |   3 +-
 nutch-plugins/nutch-extensionpoints/plugin.xml     |   4 +
 .../java/org/apache/nutch/parse/ext/ExtParser.java |   5 +-
 .../org/apache/nutch/parse/html/HtmlParser.java    |   5 +-
 .../apache/nutch/parse/html/TestHtmlParser.java    |   5 +-
 .../org/apache/nutch/parse/js/JSParseFilter.java   |   4 +-
 .../java/org/apache/nutch/parse/swf/SWFParser.java |   5 +-
 .../org/apache/nutch/parse/tika/TikaParser.java    |   4 +-
 .../java/org/apache/nutch/tika/TestFeedParser.java |   6 +-
 .../java/org/apache/nutch/parse/zip/ZipParser.java |   4 +-
 .../apache/nutch/parse/zip/ZipTextExtractor.java   |   5 +-
 .../naivebayes/NaiveBayesParseFilter.java          |   3 +-
 .../nutch/parsefilter/regex/RegexParseFilter.java  |  24 ++-
 .../java/org/apache/nutch/protocol/file/File.java  |  17 +-
 .../java/org/apache/nutch/protocol/ftp/Ftp.java    |  13 +-
 .../nutch/protocol/ftp/FtpRobotRulesParser.java    |  22 ++-
 .../org/apache/nutch/protocol/htmlunit/Http.java   |   4 +-
 .../java/org/apache/nutch/protocol/http/Http.java  |   4 +-
 .../apache/nutch/protocol/http/HttpResponse.java   |  11 +-
 .../httpclient/DummySSLProtocolSocketFactory.java  |   3 +-
 .../protocol/httpclient/DummyX509TrustManager.java |   3 +-
 .../org/apache/nutch/protocol/httpclient/Http.java |  83 ++++++----
 .../httpclient/HttpAuthenticationFactory.java      |   5 +-
 .../httpclient/HttpBasicAuthentication.java        |   5 +-
 .../httpclient/HttpFormAuthConfigurer.java         |  21 ++-
 .../httpclient/HttpFormAuthentication.java         |  85 +++++++---
 .../nutch/protocol/httpclient/HttpResponse.java    |   7 +
 .../nutch/protocol/interactiveselenium/Http.java   |   4 +-
 .../handlers/DefalultMultiInteractionHandler.java  |   4 +-
 .../handlers/DefaultClickAllAjaxLinksHandler.java  |   3 +-
 .../org/apache/nutch/protocol/selenium/Http.java   |   4 +-
 .../nutch/scoring/opic/OPICScoringFilter.java      |   5 +-
 .../similarity/cosine/CosineSimilarity.java        |   7 +-
 .../nutch/scoring/similarity/cosine/Model.java     |   6 +-
 .../apache/nutch/collection/CollectionManager.java |   4 +-
 .../subcollection/SubcollectionIndexingFilter.java |   6 +-
 .../nutch/indexer/tld/TLDIndexingFilter.java       |   5 +-
 .../nutch/urlfilter/domain/DomainURLFilter.java    |   3 +-
 .../domainblacklist/DomainBlacklistURLFilter.java  |   3 +-
 .../urlfilter/ignoreexempt/ExemptionUrlFilter.java |   5 +-
 .../nutch/urlfilter/prefix/PrefixURLFilter.java    |   3 +-
 .../nutch/urlfilter/suffix/SuffixURLFilter.java    |   3 +-
 .../indexer/urlmeta/URLMetaIndexingFilter.java     |   4 +-
 .../scoring/urlmeta/URLMetaScoringFilter.java      |   3 +-
 .../net/urlnormalizer/ajax/AjaxURLNormalizer.java  |   4 +-
 .../urlnormalizer/basic/BasicURLNormalizer.java    |  21 ++-
 .../basic/TestBasicURLNormalizer.java              |  11 +-
 .../net/urlnormalizer/host/HostURLNormalizer.java  |   3 +-
 .../protocol/ProtocolURLNormalizer.java            |   4 +-
 .../querystring/QuerystringURLNormalizer.java      |   3 +-
 .../urlnormalizer/regex/RegexURLNormalizer.java    |   3 +-
 .../regex/TestRegexURLNormalizer.java              |   3 +-
 .../urlnormalizer/slash/SlashURLNormalizer.java    |   4 +-
 .../apache/nutch/fetcher/FetcherThreadEvent.java   | 147 +++++++++++++++++
 .../nutch/fetcher/FetcherThreadPublisher.java      |  61 +++++++
 .../org/apache/nutch/publisher/NutchPublisher.java |  47 +++---
 .../apache/nutch/publisher/NutchPublishers.java    |  83 ++++++++++
 .../java/org/apache/nutch/service/SeedManager.java |  34 ++--
 .../apache/nutch/service/impl/SeedManagerImpl.java |  60 +++----
 src/plugin/parsefilter-regex/README.txt            |  41 +++++
 src/plugin/publish-rabbitmq/build-ivy.xml          |  54 ++++++
 src/plugin/publish-rabbitmq/build.xml              |  27 +++
 src/plugin/publish-rabbitmq/ivy.xml                |  42 +++++
 src/plugin/publish-rabbitmq/plugin.xml             |  43 +++++
 .../publisher/rabbitmq/RabbitMQPublisherImpl.java  |  95 +++++++++++
 .../nutch/publisher/rabbitmq/package-info.java     |  25 +--
 244 files changed, 2202 insertions(+), 927 deletions(-)

diff --cc nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
index 599c353,7c4b2eb..bfb1581
--- a/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
+++ b/nutch-core/src/test/java/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@@ -36,11 -37,10 +38,11 @@@ import org.junit.After
  import org.junit.Assert;
  import org.junit.Before;
  import org.junit.Test;
 +import org.junit.experimental.categories.Category;
  
  public class TestCrawlDbMerger {
-   private static final Logger LOG = Logger.getLogger(CrawlDbMerger.class
-       .getName());
+   private static final Logger LOG = LoggerFactory
+       .getLogger(MethodHandles.lookup().lookupClass());
  
    String url10 = "http://example.com/";
    String url11 = "http://example.com/foo";

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.