You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:35 UTC
[51/69] [abbrv] [partial] nutch git commit: Rearranged the source
code as per Maven conventions for the build
Rearranged the source code as per Maven conventions for the build
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ffa16784
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ffa16784
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ffa16784
Branch: refs/heads/NUTCH-2292
Commit: ffa167843999d6434d62ed7f636c9c9ae2eff080
Parents: 4eaeeb6
Author: Thamme Gowda <th...@apache.org>
Authored: Tue Jul 5 15:02:59 2016 -0700
Committer: Thamme Gowda <th...@apache.org>
Committed: Tue Jul 5 15:02:59 2016 -0700
----------------------------------------------------------------------
.gitignore | 6 +
.../resources/fetch-test-site/dup_of_pagea.html | 11 +
.../resources/fetch-test-site/exception.html | 13 +
.../test/resources/fetch-test-site/index.html | 13 +
.../fetch-test-site/nested_spider_trap.html | 23 +
.../test/resources/fetch-test-site/pagea.html | 11 +
.../test/resources/fetch-test-site/pageb.html | 11 +
.../test/resources/fetch-test-site/robots.txt | 0
.../src/test/resources/test-mime-util/test.xlsx | Bin 0 -> 3950 bytes
.../20150309101625/content/part-00000/.data.crc | Bin 0 -> 124 bytes
.../content/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101625/content/part-00000/data | Bin 0 -> 14452 bytes
.../20150309101625/content/part-00000/index | Bin 0 -> 217 bytes
.../crawl_fetch/part-00000/.data.crc | Bin 0 -> 12 bytes
.../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101625/crawl_fetch/part-00000/data | Bin 0 -> 293 bytes
.../20150309101625/crawl_fetch/part-00000/index | Bin 0 -> 217 bytes
.../crawl_generate/.part-00000.crc | Bin 0 -> 12 bytes
.../20150309101625/crawl_generate/part-00000 | Bin 0 -> 169 bytes
.../20150309101625/crawl_parse/.part-00000.crc | Bin 0 -> 68 bytes
.../20150309101625/crawl_parse/part-00000 | Bin 0 -> 7627 bytes
.../parse_data/part-00000/.data.crc | Bin 0 -> 24 bytes
.../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101625/parse_data/part-00000/data | Bin 0 -> 1985 bytes
.../20150309101625/parse_data/part-00000/index | Bin 0 -> 217 bytes
.../parse_text/part-00000/.data.crc | Bin 0 -> 60 bytes
.../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101625/parse_text/part-00000/data | Bin 0 -> 6554 bytes
.../20150309101625/parse_text/part-00000/index | Bin 0 -> 217 bytes
.../20150309101656/content/part-00000/.data.crc | Bin 0 -> 3372 bytes
.../content/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101656/content/part-00000/data | Bin 0 -> 430250 bytes
.../20150309101656/content/part-00000/index | Bin 0 -> 220 bytes
.../crawl_fetch/part-00000/.data.crc | Bin 0 -> 104 bytes
.../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101656/crawl_fetch/part-00000/data | Bin 0 -> 12121 bytes
.../20150309101656/crawl_fetch/part-00000/index | Bin 0 -> 220 bytes
.../crawl_generate/.part-00000.crc | Bin 0 -> 52 bytes
.../20150309101656/crawl_generate/part-00000 | Bin 0 -> 5590 bytes
.../20150309101656/crawl_parse/.part-00000.crc | Bin 0 -> 1652 bytes
.../20150309101656/crawl_parse/part-00000 | Bin 0 -> 210047 bytes
.../parse_data/part-00000/.data.crc | Bin 0 -> 460 bytes
.../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101656/parse_data/part-00000/data | Bin 0 -> 57355 bytes
.../20150309101656/parse_data/part-00000/index | Bin 0 -> 220 bytes
.../parse_text/part-00000/.data.crc | Bin 0 -> 1260 bytes
.../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes
.../20150309101656/parse_text/part-00000/data | Bin 0 -> 159920 bytes
.../20150309101656/parse_text/part-00000/index | Bin 0 -> 220 bytes
nutch-plugins/build-plugin.xml | 255 ++++++
nutch-plugins/build.xml | 213 +++++
nutch-plugins/creativecommons/README.txt | 1 +
nutch-plugins/creativecommons/build.xml | 28 +
.../creativecommons/conf/crawl-urlfilter.txt | 18 +
.../creativecommons/conf/nutch-site.xml | 50 ++
nutch-plugins/creativecommons/data/anchor.html | 9 +
nutch-plugins/creativecommons/data/rdf.html | 35 +
nutch-plugins/creativecommons/data/rel.html | 6 +
nutch-plugins/creativecommons/ivy.xml | 41 +
nutch-plugins/creativecommons/plugin.xml | 48 ++
nutch-plugins/creativecommons/pom.xml | 38 +
.../creativecommons/nutch/CCIndexingFilter.java | 124 +++
.../creativecommons/nutch/CCParseFilter.java | 300 +++++++
.../java/org/creativecommons/nutch/package.html | 5 +
.../nutch/TestCCParseFilter.java | 73 ++
nutch-plugins/feed/build.xml | 45 ++
nutch-plugins/feed/ivy.xml | 43 +
nutch-plugins/feed/plugin.xml | 49 ++
nutch-plugins/feed/pom.xml | 45 ++
nutch-plugins/feed/sample/rsstest.rss | 36 +
.../nutch/indexer/feed/FeedIndexingFilter.java | 129 +++
.../apache/nutch/indexer/feed/package-info.java | 22 +
.../org/apache/nutch/parse/feed/FeedParser.java | 374 +++++++++
.../apache/nutch/parse/feed/package-info.java | 22 +
.../apache/nutch/parse/feed/TestFeedParser.java | 124 +++
nutch-plugins/headings/build.xml | 22 +
nutch-plugins/headings/ivy.xml | 41 +
nutch-plugins/headings/plugin.xml | 45 ++
nutch-plugins/headings/pom.xml | 38 +
.../parse/headings/HeadingsParseFilter.java | 124 +++
.../nutch/parse/headings/package-info.java | 22 +
nutch-plugins/index-anchor/build.xml | 22 +
nutch-plugins/index-anchor/ivy.xml | 41 +
nutch-plugins/index-anchor/plugin.xml | 38 +
nutch-plugins/index-anchor/pom.xml | 38 +
.../indexer/anchor/AnchorIndexingFilter.java | 107 +++
.../apache/nutch/indexer/anchor/package.html | 5 +
.../anchor/TestAnchorIndexingFilter.java | 67 ++
nutch-plugins/index-basic/build.xml | 22 +
nutch-plugins/index-basic/ivy.xml | 41 +
nutch-plugins/index-basic/plugin.xml | 42 +
nutch-plugins/index-basic/pom.xml | 38 +
.../indexer/basic/BasicIndexingFilter.java | 158 ++++
.../org/apache/nutch/indexer/basic/package.html | 5 +
.../indexer/basic/TestBasicIndexingFilter.java | 99 +++
nutch-plugins/index-geoip/build-ivy.xml | 54 ++
nutch-plugins/index-geoip/build.xml | 27 +
nutch-plugins/index-geoip/ivy.xml | 46 ++
nutch-plugins/index-geoip/plugin.xml | 51 ++
nutch-plugins/index-geoip/pom.xml | 55 ++
.../indexer/geoip/GeoIPDocumentCreator.java | 210 +++++
.../indexer/geoip/GeoIPIndexingFilter.java | 241 ++++++
.../nutch/indexer/geoip/package-info.java | 28 +
nutch-plugins/index-links/build.xml | 22 +
nutch-plugins/index-links/ivy.xml | 41 +
nutch-plugins/index-links/plugin.xml | 41 +
nutch-plugins/index-links/pom.xml | 38 +
.../indexer/links/LinksIndexingFilter.java | 167 ++++
.../indexer/links/TestLinksIndexingFilter.java | 218 +++++
.../org/apache/nutch/parse/TestOutlinks.java | 54 ++
nutch-plugins/index-metadata/build.xml | 22 +
nutch-plugins/index-metadata/ivy.xml | 41 +
nutch-plugins/index-metadata/plugin.xml | 42 +
nutch-plugins/index-metadata/pom.xml | 38 +
.../nutch/indexer/metadata/MetadataIndexer.java | 104 +++
.../nutch/indexer/metadata/package-info.java | 23 +
nutch-plugins/index-more/build.xml | 22 +
nutch-plugins/index-more/ivy.xml | 41 +
nutch-plugins/index-more/plugin.xml | 42 +
nutch-plugins/index-more/pom.xml | 38 +
.../nutch/indexer/more/MoreIndexingFilter.java | 344 ++++++++
.../org/apache/nutch/indexer/more/package.html | 6 +
.../indexer/more/TestMoreIndexingFilter.java | 123 +++
nutch-plugins/index-replace/README.txt | 95 +++
nutch-plugins/index-replace/build.xml | 55 ++
nutch-plugins/index-replace/ivy.xml | 41 +
nutch-plugins/index-replace/plugin.xml | 22 +
nutch-plugins/index-replace/pom.xml | 38 +
.../index-replace/sample/testIndexReplace.html | 12 +
.../nutch/indexer/replace/FieldReplacer.java | 196 +++++
.../nutch/indexer/replace/ReplaceIndexer.java | 330 ++++++++
.../nutch/indexer/replace/package-info.java | 22 +
.../nutch/indexer/replace/TestIndexReplace.java | 456 +++++++++++
nutch-plugins/index-static/build.xml | 22 +
nutch-plugins/index-static/ivy.xml | 41 +
nutch-plugins/index-static/plugin.xml | 42 +
nutch-plugins/index-static/pom.xml | 38 +
.../indexer/staticfield/StaticFieldIndexer.java | 143 ++++
.../nutch/indexer/staticfield/package.html | 5 +
.../staticfield/TestStaticFieldIndexerTest.java | 194 +++++
nutch-plugins/indexer-cloudsearch/README.md | 58 ++
nutch-plugins/indexer-cloudsearch/build.xml | 22 +
.../indexer-cloudsearch/createCSDomain.sh | 22 +
nutch-plugins/indexer-cloudsearch/ivy.xml | 41 +
nutch-plugins/indexer-cloudsearch/plugin.xml | 50 ++
nutch-plugins/indexer-cloudsearch/pom.xml | 45 ++
.../cloudsearch/CloudSearchConstants.java | 27 +
.../cloudsearch/CloudSearchIndexWriter.java | 382 +++++++++
.../cloudsearch/CloudSearchUtils.java | 73 ++
nutch-plugins/indexer-dummy/build.xml | 22 +
nutch-plugins/indexer-dummy/ivy.xml | 41 +
nutch-plugins/indexer-dummy/plugin.xml | 38 +
nutch-plugins/indexer-dummy/pom.xml | 38 +
.../indexwriter/dummy/DummyIndexWriter.java | 103 +++
.../nutch/indexwriter/dummy/package-info.java | 23 +
nutch-plugins/indexer-elastic/build-ivy.xml | 54 ++
nutch-plugins/indexer-elastic/build.xml | 22 +
.../indexer-elastic/howto_upgrade_es.txt | 6 +
nutch-plugins/indexer-elastic/ivy.xml | 43 +
nutch-plugins/indexer-elastic/plugin.xml | 71 ++
nutch-plugins/indexer-elastic/pom.xml | 45 ++
.../indexwriter/elastic/ElasticConstants.java | 28 +
.../indexwriter/elastic/ElasticIndexWriter.java | 279 +++++++
.../nutch/indexwriter/elastic/package-info.java | 22 +
nutch-plugins/indexer-solr/build-ivy.xml | 54 ++
nutch-plugins/indexer-solr/build.xml | 22 +
nutch-plugins/indexer-solr/ivy.xml | 44 +
nutch-plugins/indexer-solr/plugin.xml | 48 ++
nutch-plugins/indexer-solr/pom.xml | 55 ++
.../nutch/indexwriter/solr/SolrConstants.java | 56 ++
.../nutch/indexwriter/solr/SolrIndexWriter.java | 277 +++++++
.../indexwriter/solr/SolrMappingReader.java | 147 ++++
.../nutch/indexwriter/solr/SolrUtils.java | 97 +++
.../nutch/indexwriter/solr/package-info.java | 22 +
nutch-plugins/language-identifier/build.xml | 38 +
nutch-plugins/language-identifier/ivy.xml | 41 +
nutch-plugins/language-identifier/plugin.xml | 49 ++
nutch-plugins/language-identifier/pom.xml | 38 +
.../nutch/analysis/lang/HTMLLanguageParser.java | 320 ++++++++
.../analysis/lang/LanguageIndexingFilter.java | 89 +++
.../nutch/analysis/lang/langmappings.properties | 188 +++++
.../org/apache/nutch/analysis/lang/package.html | 6 +
.../analysis/lang/TestHTMLLanguageParser.java | 149 ++++
.../test/org/apache/nutch/analysis/lang/da.test | 108 +++
.../test/org/apache/nutch/analysis/lang/de.test | 104 +++
.../test/org/apache/nutch/analysis/lang/el.test | 109 +++
.../test/org/apache/nutch/analysis/lang/en.test | 105 +++
.../test/org/apache/nutch/analysis/lang/es.test | 107 +++
.../test/org/apache/nutch/analysis/lang/fi.test | 106 +++
.../test/org/apache/nutch/analysis/lang/fr.test | 105 +++
.../test/org/apache/nutch/analysis/lang/it.test | 109 +++
.../test/org/apache/nutch/analysis/lang/nl.test | 105 +++
.../test/org/apache/nutch/analysis/lang/pt.test | 105 +++
.../test/org/apache/nutch/analysis/lang/sv.test | 108 +++
.../nutch/analysis/lang/test-referencial.txt | 10 +
nutch-plugins/lib-htmlunit/build-ivy.xml | 54 ++
nutch-plugins/lib-htmlunit/build.xml | 28 +
nutch-plugins/lib-htmlunit/ivy.xml | 52 ++
nutch-plugins/lib-htmlunit/plugin.xml | 166 ++++
nutch-plugins/lib-htmlunit/pom.xml | 55 ++
.../protocol/htmlunit/HtmlUnitWebDriver.java | 189 +++++
.../htmlunit/HtmlUnitWebWindowListener.java | 53 ++
nutch-plugins/lib-http/build.xml | 22 +
nutch-plugins/lib-http/ivy.xml | 41 +
nutch-plugins/lib-http/plugin.xml | 33 +
nutch-plugins/lib-http/pom.xml | 38 +
.../protocol/http/api/BlockedException.java | 26 +
.../nutch/protocol/http/api/HttpBase.java | 587 ++++++++++++++
.../nutch/protocol/http/api/HttpException.java | 40 +
.../protocol/http/api/HttpRobotRulesParser.java | 167 ++++
.../apache/nutch/protocol/http/api/package.html | 6 +
.../protocol/http/api/TestRobotRulesParser.java | 123 +++
nutch-plugins/lib-nekohtml/build.xml | 30 +
nutch-plugins/lib-nekohtml/ivy.xml | 42 +
nutch-plugins/lib-nekohtml/plugin.xml | 38 +
nutch-plugins/lib-nekohtml/pom.xml | 38 +
nutch-plugins/lib-regex-filter/build.xml | 22 +
nutch-plugins/lib-regex-filter/ivy.xml | 41 +
nutch-plugins/lib-regex-filter/plugin.xml | 33 +
nutch-plugins/lib-regex-filter/pom.xml | 38 +
.../apache/nutch/urlfilter/api/RegexRule.java | 102 +++
.../nutch/urlfilter/api/RegexURLFilterBase.java | 315 ++++++++
.../nutch/urlfilter/api/package-info.java | 23 +
.../urlfilter/api/RegexURLFilterBaseTest.java | 134 ++++
nutch-plugins/lib-selenium/build-ivy.xml | 54 ++
nutch-plugins/lib-selenium/build.xml | 28 +
.../lib-selenium/howto_upgrade_selenium.txt | 15 +
nutch-plugins/lib-selenium/ivy.xml | 52 ++
nutch-plugins/lib-selenium/plugin.xml | 175 ++++
nutch-plugins/lib-selenium/pom.xml | 49 ++
.../nutch/protocol/selenium/HttpWebClient.java | 236 ++++++
nutch-plugins/lib-xml/build.xml | 36 +
nutch-plugins/lib-xml/ivy.xml | 44 +
nutch-plugins/lib-xml/plugin.xml | 65 ++
nutch-plugins/lib-xml/pom.xml | 38 +
nutch-plugins/microformats-reltag/build.xml | 27 +
nutch-plugins/microformats-reltag/ivy.xml | 41 +
nutch-plugins/microformats-reltag/plugin.xml | 49 ++
nutch-plugins/microformats-reltag/pom.xml | 38 +
.../reltag/RelTagIndexingFilter.java | 77 ++
.../nutch/microformats/reltag/RelTagParser.java | 148 ++++
.../nutch/microformats/reltag/package.html | 8 +
nutch-plugins/mimetype-filter/build.xml | 28 +
nutch-plugins/mimetype-filter/ivy.xml | 41 +
nutch-plugins/mimetype-filter/plugin.xml | 37 +
nutch-plugins/mimetype-filter/pom.xml | 38 +
.../mimetype-filter/sample/allow-images.txt | 34 +
.../mimetype-filter/sample/block-html.txt | 34 +
.../indexer/filter/MimeTypeIndexingFilter.java | 273 +++++++
.../filter/MimeTypeIndexingFilterTest.java | 114 +++
nutch-plugins/nutch-extensionpoints/build.xml | 30 +
nutch-plugins/nutch-extensionpoints/ivy.xml | 41 +
nutch-plugins/nutch-extensionpoints/plugin.xml | 67 ++
nutch-plugins/nutch-extensionpoints/pom.xml | 38 +
nutch-plugins/parse-ext/build.xml | 32 +
nutch-plugins/parse-ext/command | 24 +
nutch-plugins/parse-ext/ivy.xml | 41 +
nutch-plugins/parse-ext/plugin.xml | 60 ++
nutch-plugins/parse-ext/pom.xml | 38 +
.../org/apache/nutch/parse/ext/ExtParser.java | 183 +++++
.../apache/nutch/parse/ext/package-info.java | 22 +
.../apache/nutch/parse/ext/TestExtParser.java | 130 +++
nutch-plugins/parse-html/build.xml | 40 +
nutch-plugins/parse-html/ivy.xml | 42 +
nutch-plugins/parse-html/plugin.xml | 48 ++
nutch-plugins/parse-html/pom.xml | 49 ++
.../org/apache/nutch/parse/html/DOMBuilder.java | 766 ++++++++++++++++++
.../nutch/parse/html/DOMContentUtils.java | 400 ++++++++++
.../nutch/parse/html/HTMLMetaProcessor.java | 214 +++++
.../org/apache/nutch/parse/html/HtmlParser.java | 352 ++++++++
.../parse/html/XMLCharacterRecognizer.java | 112 +++
.../org/apache/nutch/parse/html/package.html | 5 +
.../nutch/parse/html/TestDOMContentUtils.java | 347 ++++++++
.../apache/nutch/parse/html/TestHtmlParser.java | 122 +++
.../parse/html/TestRobotsMetaProcessor.java | 155 ++++
nutch-plugins/parse-js/build.xml | 22 +
nutch-plugins/parse-js/ivy.xml | 41 +
nutch-plugins/parse-js/plugin.xml | 53 ++
nutch-plugins/parse-js/pom.xml | 38 +
.../apache/nutch/parse/js/JSParseFilter.java | 301 +++++++
.../org/apache/nutch/parse/js/package-info.java | 23 +
nutch-plugins/parse-metatags/README.txt | 17 +
nutch-plugins/parse-metatags/build.xml | 37 +
nutch-plugins/parse-metatags/ivy.xml | 41 +
nutch-plugins/parse-metatags/plugin.xml | 22 +
nutch-plugins/parse-metatags/pom.xml | 38 +
.../parse-metatags/sample/testMetatags.html | 9 +
.../sample/testMultivalueMetatags.html | 12 +
.../nutch/parse/metatags/MetaTagsParser.java | 124 +++
.../nutch/parse/metatags/package-info.java | 24 +
.../nutch/parse/metatags/TestMetatagParser.java | 104 +++
nutch-plugins/parse-replace/README.txt | 91 +++
nutch-plugins/parse-replace/build.xml | 37 +
nutch-plugins/parse-replace/ivy.xml | 41 +
nutch-plugins/parse-replace/plugin.xml | 22 +
nutch-plugins/parse-replace/pom.xml | 38 +
.../parse-replace/sample/testParseReplace.html | 11 +
.../nutch/parse/replace/ReplaceParser.java | 74 ++
.../nutch/parse/replace/package-info.java | 22 +
.../nutch/parse/replace/TestParseReplace.java | 68 ++
nutch-plugins/parse-swf/build.xml | 38 +
nutch-plugins/parse-swf/ivy.xml | 41 +
nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt | 33 +
nutch-plugins/parse-swf/lib/javaswf.jar | Bin 0 -> 125369 bytes
nutch-plugins/parse-swf/plugin.xml | 44 +
nutch-plugins/parse-swf/pom.xml | 46 ++
nutch-plugins/parse-swf/sample/test1.swf | Bin 0 -> 21054 bytes
nutch-plugins/parse-swf/sample/test1.txt | 60 ++
nutch-plugins/parse-swf/sample/test2.swf | Bin 0 -> 42534 bytes
nutch-plugins/parse-swf/sample/test2.txt | 5 +
nutch-plugins/parse-swf/sample/test3.swf | Bin 0 -> 51562 bytes
nutch-plugins/parse-swf/sample/test3.txt | 11 +
.../org/apache/nutch/parse/swf/SWFParser.java | 685 ++++++++++++++++
.../apache/nutch/parse/swf/package-info.java | 22 +
.../apache/nutch/parse/swf/TestSWFParser.java | 94 +++
nutch-plugins/parse-tika/build-ivy.xml | 54 ++
nutch-plugins/parse-tika/build.xml | 55 ++
nutch-plugins/parse-tika/howto_upgrade_tika.txt | 8 +
nutch-plugins/parse-tika/ivy.xml | 46 ++
nutch-plugins/parse-tika/plugin.xml | 136 ++++
nutch-plugins/parse-tika/pom.xml | 45 ++
nutch-plugins/parse-tika/sample/encrypted.pdf | Bin 0 -> 3431 bytes
nutch-plugins/parse-tika/sample/nutch.html | 519 ++++++++++++
.../parse-tika/sample/nutch_logo_tm.gif | Bin 0 -> 2747 bytes
nutch-plugins/parse-tika/sample/ootest.odt | Bin 0 -> 20753 bytes
nutch-plugins/parse-tika/sample/ootest.sxw | Bin 0 -> 20125 bytes
nutch-plugins/parse-tika/sample/ootest.txt | 30 +
nutch-plugins/parse-tika/sample/pdftest.pdf | 157 ++++
nutch-plugins/parse-tika/sample/rsstest.rss | 37 +
nutch-plugins/parse-tika/sample/test.rtf | 17 +
nutch-plugins/parse-tika/sample/word97.doc | Bin 0 -> 8192 bytes
.../tika/BoilerpipeExtractorRepository.java | 62 ++
.../org/apache/nutch/parse/tika/DOMBuilder.java | 794 +++++++++++++++++++
.../nutch/parse/tika/DOMContentUtils.java | 402 ++++++++++
.../nutch/parse/tika/HTMLMetaProcessor.java | 214 +++++
.../org/apache/nutch/parse/tika/TikaParser.java | 286 +++++++
.../parse/tika/XMLCharacterRecognizer.java | 112 +++
.../apache/nutch/parse/tika/package-info.java | 23 +
.../apache/nutch/tika/TestDOMContentUtils.java | 337 ++++++++
.../org/apache/nutch/tika/TestFeedParser.java | 121 +++
.../apache/nutch/tika/TestImageMetadata.java | 67 ++
.../org/apache/nutch/tika/TestMSWordParser.java | 92 +++
.../org/apache/nutch/tika/TestOOParser.java | 107 +++
.../org/apache/nutch/tika/TestPdfParser.java | 73 ++
.../org/apache/nutch/tika/TestRTFParser.java | 81 ++
.../nutch/tika/TestRobotsMetaProcessor.java | 156 ++++
nutch-plugins/parse-zip/build.xml | 38 +
nutch-plugins/parse-zip/ivy.xml | 41 +
nutch-plugins/parse-zip/plugin.xml | 46 ++
nutch-plugins/parse-zip/pom.xml | 38 +
nutch-plugins/parse-zip/sample/test.zip | Bin 0 -> 182 bytes
.../org/apache/nutch/parse/zip/ZipParser.java | 144 ++++
.../nutch/parse/zip/ZipTextExtractor.java | 120 +++
.../apache/nutch/parse/zip/package-info.java | 22 +
.../apache/nutch/parse/zip/TestZipParser.java | 71 ++
.../parsefilter-naivebayes/build-ivy.xml | 54 ++
nutch-plugins/parsefilter-naivebayes/build.xml | 22 +
nutch-plugins/parsefilter-naivebayes/ivy.xml | 49 ++
nutch-plugins/parsefilter-naivebayes/plugin.xml | 56 ++
nutch-plugins/parsefilter-naivebayes/pom.xml | 38 +
.../nutch/parsefilter/naivebayes/Classify.java | 120 +++
.../naivebayes/NaiveBayesParseFilter.java | 197 +++++
.../nutch/parsefilter/naivebayes/Train.java | 148 ++++
.../parsefilter/naivebayes/package-info.java | 28 +
nutch-plugins/parsefilter-regex/build.xml | 27 +
.../data/regex-parsefilter.txt | 10 +
nutch-plugins/parsefilter-regex/ivy.xml | 37 +
nutch-plugins/parsefilter-regex/plugin.xml | 42 +
nutch-plugins/parsefilter-regex/pom.xml | 38 +
.../parsefilter/regex/RegexParseFilter.java | 199 +++++
.../nutch/parsefilter/regex/package-info.java | 23 +
.../parsefilter/regex/TestRegexParseFilter.java | 77 ++
nutch-plugins/plugin.dtd | 206 +++++
nutch-plugins/plugin/pom.xml | 38 +
nutch-plugins/pom.xml | 94 ++-
nutch-plugins/protocol-file/build.xml | 29 +
nutch-plugins/protocol-file/ivy.xml | 41 +
nutch-plugins/protocol-file/plugin.xml | 46 ++
nutch-plugins/protocol-file/pom.xml | 38 +
.../protocol-file/sample/testprotocolfile.txt | 1 +
.../sample/testprotocolfile_(encoded).txt | 1 +
.../org/apache/nutch/protocol/file/File.java | 228 ++++++
.../apache/nutch/protocol/file/FileError.java | 36 +
.../nutch/protocol/file/FileException.java | 40 +
.../nutch/protocol/file/FileResponse.java | 317 ++++++++
.../org/apache/nutch/protocol/file/package.html | 5 +
.../nutch/protocol/file/TestProtocolFile.java | 99 +++
nutch-plugins/protocol-ftp/build.xml | 22 +
nutch-plugins/protocol-ftp/ivy.xml | 42 +
nutch-plugins/protocol-ftp/plugin.xml | 46 ++
nutch-plugins/protocol-ftp/pom.xml | 38 +
.../org/apache/nutch/protocol/ftp/Client.java | 595 ++++++++++++++
.../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 +++++++
.../org/apache/nutch/protocol/ftp/FtpError.java | 36 +
.../apache/nutch/protocol/ftp/FtpException.java | 46 ++
.../ftp/FtpExceptionBadSystResponse.java | 29 +
.../FtpExceptionCanNotHaveDataConnection.java | 29 +
...ExceptionControlClosedByForcedDataClose.java | 30 +
.../ftp/FtpExceptionUnknownForcedDataClose.java | 30 +
.../apache/nutch/protocol/ftp/FtpResponse.java | 521 ++++++++++++
.../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 +++
.../protocol/ftp/PrintCommandListener.java | 71 ++
.../org/apache/nutch/protocol/ftp/package.html | 5 +
nutch-plugins/protocol-htmlunit/build.xml | 37 +
nutch-plugins/protocol-htmlunit/ivy.xml | 38 +
nutch-plugins/protocol-htmlunit/plugin.xml | 51 ++
nutch-plugins/protocol-htmlunit/pom.xml | 51 ++
.../apache/nutch/protocol/htmlunit/Http.java | 63 ++
.../nutch/protocol/htmlunit/HttpResponse.java | 573 +++++++++++++
.../apache/nutch/protocol/htmlunit/package.html | 21 +
nutch-plugins/protocol-http/build.xml | 50 ++
nutch-plugins/protocol-http/ivy.xml | 41 +
nutch-plugins/protocol-http/jsp/basic-http.jsp | 44 +
nutch-plugins/protocol-http/jsp/brokenpage.jsp | 47 ++
nutch-plugins/protocol-http/jsp/redirect301.jsp | 49 ++
nutch-plugins/protocol-http/jsp/redirect302.jsp | 49 ++
nutch-plugins/protocol-http/plugin.xml | 51 ++
nutch-plugins/protocol-http/pom.xml | 45 ++
.../org/apache/nutch/protocol/http/Http.java | 73 ++
.../nutch/protocol/http/HttpResponse.java | 558 +++++++++++++
.../org/apache/nutch/protocol/http/package.html | 5 +
.../src/test/conf/nutch-site-test.xml | 52 ++
.../nutch/protocol/http/TestProtocolHttp.java | 140 ++++
nutch-plugins/protocol-httpclient/build.xml | 45 ++
nutch-plugins/protocol-httpclient/ivy.xml | 42 +
nutch-plugins/protocol-httpclient/jsp/basic.jsp | 74 ++
.../protocol-httpclient/jsp/cookies.jsp | 63 ++
.../protocol-httpclient/jsp/digest.jsp | 68 ++
.../protocol-httpclient/jsp/noauth.jsp | 36 +
nutch-plugins/protocol-httpclient/jsp/ntlm.jsp | 89 +++
nutch-plugins/protocol-httpclient/plugin.xml | 58 ++
nutch-plugins/protocol-httpclient/pom.xml | 50 ++
.../DummySSLProtocolSocketFactory.java | 163 ++++
.../httpclient/DummyX509TrustManager.java | 92 +++
.../apache/nutch/protocol/httpclient/Http.java | 572 +++++++++++++
.../protocol/httpclient/HttpAuthentication.java | 45 ++
.../httpclient/HttpAuthenticationException.java | 71 ++
.../httpclient/HttpAuthenticationFactory.java | 98 +++
.../httpclient/HttpBasicAuthentication.java | 199 +++++
.../httpclient/HttpFormAuthConfigurer.java | 106 +++
.../httpclient/HttpFormAuthentication.java | 223 ++++++
.../nutch/protocol/httpclient/HttpResponse.java | 216 +++++
.../nutch/protocol/httpclient/package.html | 9 +
.../src/test/conf/httpclient-auth-test.xml | 58 ++
.../src/test/conf/nutch-site-test.xml | 52 ++
.../httpclient/TestProtocolHttpClient.java | 217 +++++
.../protocol-interactiveselenium/README.md | 38 +
.../protocol-interactiveselenium/build-ivy.xml | 54 ++
.../protocol-interactiveselenium/build.xml | 37 +
.../protocol-interactiveselenium/ivy.xml | 42 +
.../protocol-interactiveselenium/plugin.xml | 47 ++
.../protocol-interactiveselenium/pom.xml | 50 ++
.../protocol/interactiveselenium/Http.java | 59 ++
.../interactiveselenium/HttpResponse.java | 399 ++++++++++
.../DefalultMultiInteractionHandler.java | 53 ++
.../DefaultClickAllAjaxLinksHandler.java | 88 ++
.../handlers/DefaultHandler.java | 30 +
.../handlers/InteractiveSeleniumHandler.java | 25 +
.../protocol/interactiveselenium/package.html | 5 +
nutch-plugins/protocol-selenium/README.md | 208 +++++
nutch-plugins/protocol-selenium/build-ivy.xml | 54 ++
nutch-plugins/protocol-selenium/build.xml | 36 +
nutch-plugins/protocol-selenium/ivy.xml | 42 +
nutch-plugins/protocol-selenium/plugin.xml | 47 ++
nutch-plugins/protocol-selenium/pom.xml | 50 ++
.../apache/nutch/protocol/selenium/Http.java | 59 ++
.../nutch/protocol/selenium/HttpResponse.java | 360 +++++++++
.../apache/nutch/protocol/selenium/package.html | 5 +
nutch-plugins/scoring-depth/build.xml | 6 +
nutch-plugins/scoring-depth/ivy.xml | 41 +
nutch-plugins/scoring-depth/plugin.xml | 24 +
nutch-plugins/scoring-depth/pom.xml | 38 +
.../nutch/scoring/depth/DepthScoringFilter.java | 207 +++++
.../nutch/scoring/depth/package-info.java | 23 +
nutch-plugins/scoring-link/build.xml | 27 +
nutch-plugins/scoring-link/ivy.xml | 41 +
nutch-plugins/scoring-link/plugin.xml | 39 +
nutch-plugins/scoring-link/pom.xml | 38 +
.../scoring/link/LinkAnalysisScoringFilter.java | 95 +++
.../apache/nutch/scoring/link/package-info.java | 23 +
nutch-plugins/scoring-opic/build.xml | 27 +
nutch-plugins/scoring-opic/ivy.xml | 41 +
nutch-plugins/scoring-opic/plugin.xml | 39 +
nutch-plugins/scoring-opic/pom.xml | 38 +
.../nutch/scoring/opic/OPICScoringFilter.java | 173 ++++
.../apache/nutch/scoring/opic/package-info.java | 23 +
nutch-plugins/scoring-similarity/build-ivy.xml | 54 ++
nutch-plugins/scoring-similarity/build.xml | 27 +
nutch-plugins/scoring-similarity/ivy.xml | 42 +
nutch-plugins/scoring-similarity/plugin.xml | 45 ++
nutch-plugins/scoring-similarity/pom.xml | 45 ++
.../scoring/similarity/SimilarityModel.java | 38 +
.../similarity/SimilarityScoringFilter.java | 70 ++
.../similarity/cosine/CosineSimilarity.java | 84 ++
.../scoring/similarity/cosine/DocVector.java | 57 ++
.../nutch/scoring/similarity/cosine/Model.java | 190 +++++
.../scoring/similarity/cosine/package-info.java | 7 +
.../similarity/util/LuceneAnalyzerUtil.java | 93 +++
.../similarity/util/LuceneTokenizer.java | 166 ++++
.../scoring/similarity/util/package-info.java | 24 +
nutch-plugins/subcollection/README.txt | 10 +
nutch-plugins/subcollection/build.xml | 22 +
nutch-plugins/subcollection/ivy.xml | 41 +
nutch-plugins/subcollection/plugin.xml | 41 +
nutch-plugins/subcollection/pom.xml | 38 +
.../nutch/collection/CollectionManager.java | 240 ++++++
.../apache/nutch/collection/Subcollection.java | 259 ++++++
.../org/apache/nutch/collection/package.html | 36 +
.../SubcollectionIndexingFilter.java | 101 +++
.../indexer/subcollection/package-info.java | 25 +
.../nutch/collection/TestSubcollection.java | 112 +++
nutch-plugins/tld/build.xml | 22 +
nutch-plugins/tld/ivy.xml | 41 +
nutch-plugins/tld/plugin.xml | 51 ++
nutch-plugins/tld/pom.xml | 38 +
.../nutch/indexer/tld/TLDIndexingFilter.java | 69 ++
.../org/apache/nutch/indexer/tld/package.html | 5 +
.../nutch/scoring/tld/TLDScoringFilter.java | 114 +++
.../org/apache/nutch/scoring/tld/package.html | 5 +
nutch-plugins/urlfilter-automaton/build.xml | 51 ++
nutch-plugins/urlfilter-automaton/ivy.xml | 42 +
nutch-plugins/urlfilter-automaton/plugin.xml | 43 +
nutch-plugins/urlfilter-automaton/pom.xml | 50 ++
.../urlfilter-automaton/sample/Benchmarks.rules | 26 +
.../urlfilter-automaton/sample/Benchmarks.urls | 297 +++++++
.../sample/IntranetCrawling.rules | 24 +
.../sample/IntranetCrawling.urls | 8 +
.../sample/WholeWebCrawling.rules | 19 +
.../sample/WholeWebCrawling.urls | 11 +
.../urlfilter/automaton/AutomatonURLFilter.java | 116 +++
.../nutch/urlfilter/automaton/package.html | 9 +
.../automaton/TestAutomatonURLFilter.java | 56 ++
nutch-plugins/urlfilter-domain/build.xml | 28 +
nutch-plugins/urlfilter-domain/data/hosts.txt | 5 +
nutch-plugins/urlfilter-domain/ivy.xml | 41 +
nutch-plugins/urlfilter-domain/plugin.xml | 43 +
nutch-plugins/urlfilter-domain/pom.xml | 38 +
.../nutch/urlfilter/domain/DomainURLFilter.java | 212 +++++
.../nutch/urlfilter/domain/package-info.java | 25 +
.../urlfilter/domain/TestDomainURLFilter.java | 67 ++
.../urlfilter-domainblacklist/build.xml | 28 +
.../urlfilter-domainblacklist/data/hosts.txt | 5 +
nutch-plugins/urlfilter-domainblacklist/ivy.xml | 41 +
.../urlfilter-domainblacklist/plugin.xml | 43 +
nutch-plugins/urlfilter-domainblacklist/pom.xml | 38 +
.../DomainBlacklistURLFilter.java | 210 +++++
.../urlfilter/domainblacklist/package-info.java | 24 +
.../TestDomainBlacklistURLFilter.java | 49 ++
nutch-plugins/urlfilter-ignoreexempt/README.md | 43 +
nutch-plugins/urlfilter-ignoreexempt/build.xml | 55 ++
.../urlfilter-ignoreexempt/data/.donotdelete | 0
nutch-plugins/urlfilter-ignoreexempt/ivy.xml | 41 +
nutch-plugins/urlfilter-ignoreexempt/plugin.xml | 45 ++
nutch-plugins/urlfilter-ignoreexempt/pom.xml | 45 ++
.../ignoreexempt/ExemptionUrlFilter.java | 101 +++
.../urlfilter/ignoreexempt/package-info.java | 24 +
nutch-plugins/urlfilter-prefix/build.xml | 22 +
nutch-plugins/urlfilter-prefix/ivy.xml | 41 +
nutch-plugins/urlfilter-prefix/plugin.xml | 47 ++
nutch-plugins/urlfilter-prefix/pom.xml | 38 +
.../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 +++++
.../apache/nutch/urlfilter/prefix/package.html | 5 +
.../urlfilter/prefix/TestPrefixURLFilter.java | 79 ++
nutch-plugins/urlfilter-regex/build.xml | 51 ++
nutch-plugins/urlfilter-regex/ivy.xml | 41 +
nutch-plugins/urlfilter-regex/plugin.xml | 48 ++
nutch-plugins/urlfilter-regex/pom.xml | 46 ++
.../urlfilter-regex/sample/Benchmarks.rules | 26 +
.../urlfilter-regex/sample/Benchmarks.urls | 297 +++++++
.../sample/IntranetCrawling.rules | 27 +
.../sample/IntranetCrawling.urls | 8 +
.../sample/WholeWebCrawling.rules | 22 +
.../sample/WholeWebCrawling.urls | 11 +
.../urlfilter-regex/sample/nutch1838.rules | 12 +
.../urlfilter-regex/sample/nutch1838.urls | 3 +
.../nutch/urlfilter/regex/RegexURLFilter.java | 111 +++
.../apache/nutch/urlfilter/regex/package.html | 5 +
.../urlfilter/regex/TestRegexURLFilter.java | 61 ++
nutch-plugins/urlfilter-suffix/build.xml | 22 +
nutch-plugins/urlfilter-suffix/ivy.xml | 41 +
nutch-plugins/urlfilter-suffix/plugin.xml | 47 ++
nutch-plugins/urlfilter-suffix/pom.xml | 38 +
.../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 ++++++++
.../nutch/urlfilter/suffix/package-info.java | 23 +
.../urlfilter/suffix/TestSuffixURLFilter.java | 123 +++
nutch-plugins/urlfilter-validator/build.xml | 22 +
nutch-plugins/urlfilter-validator/ivy.xml | 41 +
nutch-plugins/urlfilter-validator/plugin.xml | 41 +
nutch-plugins/urlfilter-validator/pom.xml | 38 +
.../nutch/urlfilter/validator/UrlValidator.java | 386 +++++++++
.../nutch/urlfilter/validator/package.html | 9 +
.../urlfilter/validator/TestUrlValidator.java | 79 ++
nutch-plugins/urlmeta/build.xml | 22 +
nutch-plugins/urlmeta/ivy.xml | 41 +
nutch-plugins/urlmeta/plugin.xml | 47 ++
nutch-plugins/urlmeta/pom.xml | 38 +
.../indexer/urlmeta/URLMetaIndexingFilter.java | 118 +++
.../apache/nutch/indexer/urlmeta/package.html | 12 +
.../scoring/urlmeta/URLMetaScoringFilter.java | 175 ++++
.../apache/nutch/scoring/urlmeta/package.html | 11 +
nutch-plugins/urlnormalizer-ajax/build.xml | 22 +
nutch-plugins/urlnormalizer-ajax/ivy.xml | 41 +
nutch-plugins/urlnormalizer-ajax/plugin.xml | 41 +
nutch-plugins/urlnormalizer-ajax/pom.xml | 38 +
.../urlnormalizer/ajax/AjaxURLNormalizer.java | 236 ++++++
.../ajax/TestAjaxURLNormalizer.java | 67 ++
nutch-plugins/urlnormalizer-basic/build.xml | 22 +
nutch-plugins/urlnormalizer-basic/ivy.xml | 41 +
nutch-plugins/urlnormalizer-basic/plugin.xml | 41 +
nutch-plugins/urlnormalizer-basic/pom.xml | 38 +
.../urlnormalizer/basic/BasicURLNormalizer.java | 290 +++++++
.../net/urlnormalizer/basic/package-info.java | 23 +
.../basic/TestBasicURLNormalizer.java | 175 ++++
nutch-plugins/urlnormalizer-host/build.xml | 27 +
nutch-plugins/urlnormalizer-host/data/hosts.txt | 8 +
nutch-plugins/urlnormalizer-host/ivy.xml | 41 +
nutch-plugins/urlnormalizer-host/plugin.xml | 43 +
nutch-plugins/urlnormalizer-host/pom.xml | 38 +
.../urlnormalizer/host/HostURLNormalizer.java | 198 +++++
.../net/urlnormalizer/host/package-info.java | 23 +
.../host/TestHostURLNormalizer.java | 57 ++
nutch-plugins/urlnormalizer-pass/build.xml | 22 +
nutch-plugins/urlnormalizer-pass/ivy.xml | 41 +
nutch-plugins/urlnormalizer-pass/plugin.xml | 41 +
nutch-plugins/urlnormalizer-pass/pom.xml | 38 +
.../urlnormalizer/pass/PassURLNormalizer.java | 49 ++
.../net/urlnormalizer/pass/package-info.java | 23 +
.../pass/TestPassURLNormalizer.java | 45 ++
nutch-plugins/urlnormalizer-protocol/build.xml | 27 +
.../urlnormalizer-protocol/data/protocols.txt | 7 +
nutch-plugins/urlnormalizer-protocol/ivy.xml | 41 +
nutch-plugins/urlnormalizer-protocol/plugin.xml | 43 +
nutch-plugins/urlnormalizer-protocol/pom.xml | 38 +
.../protocol/ProtocolURLNormalizer.java | 190 +++++
.../protocol/TestProtocolURLNormalizer.java | 55 ++
.../urlnormalizer-querystring/build.xml | 22 +
nutch-plugins/urlnormalizer-querystring/ivy.xml | 41 +
.../urlnormalizer-querystring/plugin.xml | 42 +
nutch-plugins/urlnormalizer-querystring/pom.xml | 38 +
.../querystring/QuerystringURLNormalizer.java | 91 +++
.../urlnormalizer/querystring/package-info.java | 23 +
.../TestQuerystringURLNormalizer.java | 49 ++
nutch-plugins/urlnormalizer-regex/build.xml | 34 +
nutch-plugins/urlnormalizer-regex/ivy.xml | 41 +
nutch-plugins/urlnormalizer-regex/plugin.xml | 41 +
nutch-plugins/urlnormalizer-regex/pom.xml | 38 +
.../sample/regex-normalize-default.test | 84 ++
.../sample/regex-normalize-default.xml | 66 ++
.../sample/regex-normalize-scope1.test | 8 +
.../sample/regex-normalize-scope1.xml | 21 +
.../urlnormalizer/regex/RegexURLNormalizer.java | 324 ++++++++
.../net/urlnormalizer/regex/package-info.java | 23 +
.../regex/TestRegexURLNormalizer.java | 186 +++++
nutch-plugins/urlnormalizer-slash/build.xml | 27 +
.../urlnormalizer-slash/data/slashes.txt | 7 +
nutch-plugins/urlnormalizer-slash/ivy.xml | 41 +
nutch-plugins/urlnormalizer-slash/plugin.xml | 43 +
nutch-plugins/urlnormalizer-slash/pom.xml | 38 +
.../urlnormalizer/slash/SlashURLNormalizer.java | 224 ++++++
.../slash/TestSlashURLNormalizer.java | 73 ++
pom.xml | 22 +-
src/plugin/build-plugin.xml | 255 ------
src/plugin/build.xml | 213 -----
src/plugin/creativecommons/README.txt | 1 -
src/plugin/creativecommons/build.xml | 28 -
.../creativecommons/conf/crawl-urlfilter.txt | 18 -
src/plugin/creativecommons/conf/nutch-site.xml | 50 --
src/plugin/creativecommons/data/anchor.html | 9 -
src/plugin/creativecommons/data/rdf.html | 35 -
src/plugin/creativecommons/data/rel.html | 6 -
src/plugin/creativecommons/ivy.xml | 41 -
src/plugin/creativecommons/plugin.xml | 48 --
.../creativecommons/nutch/CCIndexingFilter.java | 124 ---
.../creativecommons/nutch/CCParseFilter.java | 300 -------
.../java/org/creativecommons/nutch/package.html | 5 -
.../nutch/TestCCParseFilter.java | 73 --
src/plugin/feed/build.xml | 45 --
src/plugin/feed/ivy.xml | 43 -
src/plugin/feed/plugin.xml | 49 --
src/plugin/feed/sample/rsstest.rss | 36 -
.../nutch/indexer/feed/FeedIndexingFilter.java | 129 ---
.../apache/nutch/indexer/feed/package-info.java | 22 -
.../org/apache/nutch/parse/feed/FeedParser.java | 374 ---------
.../apache/nutch/parse/feed/package-info.java | 22 -
.../apache/nutch/parse/feed/TestFeedParser.java | 124 ---
src/plugin/headings/build.xml | 22 -
src/plugin/headings/ivy.xml | 41 -
src/plugin/headings/plugin.xml | 45 --
.../parse/headings/HeadingsParseFilter.java | 124 ---
.../nutch/parse/headings/package-info.java | 22 -
src/plugin/index-anchor/build.xml | 22 -
src/plugin/index-anchor/ivy.xml | 41 -
src/plugin/index-anchor/plugin.xml | 38 -
.../indexer/anchor/AnchorIndexingFilter.java | 107 ---
.../apache/nutch/indexer/anchor/package.html | 5 -
.../anchor/TestAnchorIndexingFilter.java | 67 --
src/plugin/index-basic/build.xml | 22 -
src/plugin/index-basic/ivy.xml | 41 -
src/plugin/index-basic/plugin.xml | 42 -
.../indexer/basic/BasicIndexingFilter.java | 158 ----
.../org/apache/nutch/indexer/basic/package.html | 5 -
.../indexer/basic/TestBasicIndexingFilter.java | 99 ---
src/plugin/index-geoip/build-ivy.xml | 54 --
src/plugin/index-geoip/build.xml | 27 -
src/plugin/index-geoip/ivy.xml | 46 --
src/plugin/index-geoip/plugin.xml | 51 --
.../indexer/geoip/GeoIPDocumentCreator.java | 210 -----
.../indexer/geoip/GeoIPIndexingFilter.java | 241 ------
.../nutch/indexer/geoip/package-info.java | 28 -
src/plugin/index-links/build.xml | 22 -
src/plugin/index-links/ivy.xml | 41 -
src/plugin/index-links/plugin.xml | 41 -
.../indexer/links/LinksIndexingFilter.java | 167 ----
.../indexer/links/TestLinksIndexingFilter.java | 218 -----
.../org/apache/nutch/parse/TestOutlinks.java | 54 --
src/plugin/index-metadata/build.xml | 22 -
src/plugin/index-metadata/ivy.xml | 41 -
src/plugin/index-metadata/plugin.xml | 42 -
.../nutch/indexer/metadata/MetadataIndexer.java | 104 ---
.../nutch/indexer/metadata/package-info.java | 23 -
src/plugin/index-more/build.xml | 22 -
src/plugin/index-more/ivy.xml | 41 -
src/plugin/index-more/plugin.xml | 42 -
.../nutch/indexer/more/MoreIndexingFilter.java | 344 --------
.../org/apache/nutch/indexer/more/package.html | 6 -
.../indexer/more/TestMoreIndexingFilter.java | 123 ---
src/plugin/index-replace/README.txt | 95 ---
src/plugin/index-replace/build.xml | 55 --
src/plugin/index-replace/ivy.xml | 41 -
src/plugin/index-replace/plugin.xml | 22 -
.../index-replace/sample/testIndexReplace.html | 12 -
.../nutch/indexer/replace/FieldReplacer.java | 196 -----
.../nutch/indexer/replace/ReplaceIndexer.java | 330 --------
.../nutch/indexer/replace/package-info.java | 22 -
.../nutch/indexer/replace/TestIndexReplace.java | 456 -----------
src/plugin/index-static/build.xml | 22 -
src/plugin/index-static/ivy.xml | 41 -
src/plugin/index-static/plugin.xml | 42 -
.../indexer/staticfield/StaticFieldIndexer.java | 143 ----
.../nutch/indexer/staticfield/package.html | 5 -
.../staticfield/TestStaticFieldIndexerTest.java | 194 -----
src/plugin/indexer-cloudsearch/README.md | 58 --
src/plugin/indexer-cloudsearch/build.xml | 22 -
.../indexer-cloudsearch/createCSDomain.sh | 22 -
src/plugin/indexer-cloudsearch/ivy.xml | 41 -
src/plugin/indexer-cloudsearch/plugin.xml | 50 --
.../cloudsearch/CloudSearchConstants.java | 27 -
.../cloudsearch/CloudSearchIndexWriter.java | 382 ---------
.../cloudsearch/CloudSearchUtils.java | 73 --
src/plugin/indexer-dummy/build.xml | 22 -
src/plugin/indexer-dummy/ivy.xml | 41 -
src/plugin/indexer-dummy/plugin.xml | 38 -
.../indexwriter/dummy/DummyIndexWriter.java | 103 ---
.../nutch/indexwriter/dummy/package-info.java | 23 -
src/plugin/indexer-elastic/build-ivy.xml | 54 --
src/plugin/indexer-elastic/build.xml | 22 -
src/plugin/indexer-elastic/howto_upgrade_es.txt | 6 -
src/plugin/indexer-elastic/ivy.xml | 43 -
src/plugin/indexer-elastic/plugin.xml | 71 --
.../indexwriter/elastic/ElasticConstants.java | 28 -
.../indexwriter/elastic/ElasticIndexWriter.java | 279 -------
.../nutch/indexwriter/elastic/package-info.java | 22 -
src/plugin/indexer-solr/build-ivy.xml | 54 --
src/plugin/indexer-solr/build.xml | 22 -
src/plugin/indexer-solr/ivy.xml | 44 -
src/plugin/indexer-solr/plugin.xml | 48 --
.../nutch/indexwriter/solr/SolrConstants.java | 56 --
.../nutch/indexwriter/solr/SolrIndexWriter.java | 277 -------
.../indexwriter/solr/SolrMappingReader.java | 147 ----
.../nutch/indexwriter/solr/SolrUtils.java | 97 ---
.../nutch/indexwriter/solr/package-info.java | 22 -
src/plugin/language-identifier/build.xml | 38 -
src/plugin/language-identifier/ivy.xml | 41 -
src/plugin/language-identifier/plugin.xml | 49 --
.../nutch/analysis/lang/HTMLLanguageParser.java | 320 --------
.../analysis/lang/LanguageIndexingFilter.java | 89 ---
.../nutch/analysis/lang/langmappings.properties | 188 -----
.../org/apache/nutch/analysis/lang/package.html | 6 -
.../analysis/lang/TestHTMLLanguageParser.java | 149 ----
.../test/org/apache/nutch/analysis/lang/da.test | 108 ---
.../test/org/apache/nutch/analysis/lang/de.test | 104 ---
.../test/org/apache/nutch/analysis/lang/el.test | 109 ---
.../test/org/apache/nutch/analysis/lang/en.test | 105 ---
.../test/org/apache/nutch/analysis/lang/es.test | 107 ---
.../test/org/apache/nutch/analysis/lang/fi.test | 106 ---
.../test/org/apache/nutch/analysis/lang/fr.test | 105 ---
.../test/org/apache/nutch/analysis/lang/it.test | 109 ---
.../test/org/apache/nutch/analysis/lang/nl.test | 105 ---
.../test/org/apache/nutch/analysis/lang/pt.test | 105 ---
.../test/org/apache/nutch/analysis/lang/sv.test | 108 ---
.../nutch/analysis/lang/test-referencial.txt | 10 -
src/plugin/lib-htmlunit/build-ivy.xml | 54 --
src/plugin/lib-htmlunit/build.xml | 28 -
src/plugin/lib-htmlunit/ivy.xml | 52 --
src/plugin/lib-htmlunit/plugin.xml | 166 ----
.../protocol/htmlunit/HtmlUnitWebDriver.java | 189 -----
.../htmlunit/HtmlUnitWebWindowListener.java | 53 --
src/plugin/lib-http/build.xml | 22 -
src/plugin/lib-http/ivy.xml | 41 -
src/plugin/lib-http/plugin.xml | 33 -
.../protocol/http/api/BlockedException.java | 26 -
.../nutch/protocol/http/api/HttpBase.java | 587 --------------
.../nutch/protocol/http/api/HttpException.java | 40 -
.../protocol/http/api/HttpRobotRulesParser.java | 167 ----
.../apache/nutch/protocol/http/api/package.html | 6 -
.../protocol/http/api/TestRobotRulesParser.java | 123 ---
src/plugin/lib-nekohtml/build.xml | 30 -
src/plugin/lib-nekohtml/ivy.xml | 42 -
src/plugin/lib-nekohtml/plugin.xml | 38 -
src/plugin/lib-regex-filter/build.xml | 22 -
src/plugin/lib-regex-filter/ivy.xml | 41 -
src/plugin/lib-regex-filter/plugin.xml | 33 -
.../apache/nutch/urlfilter/api/RegexRule.java | 102 ---
.../nutch/urlfilter/api/RegexURLFilterBase.java | 315 --------
.../nutch/urlfilter/api/package-info.java | 23 -
.../urlfilter/api/RegexURLFilterBaseTest.java | 134 ----
src/plugin/lib-selenium/build-ivy.xml | 54 --
src/plugin/lib-selenium/build.xml | 28 -
.../lib-selenium/howto_upgrade_selenium.txt | 15 -
src/plugin/lib-selenium/ivy.xml | 52 --
src/plugin/lib-selenium/plugin.xml | 175 ----
.../nutch/protocol/selenium/HttpWebClient.java | 236 ------
src/plugin/lib-xml/build.xml | 36 -
src/plugin/lib-xml/ivy.xml | 44 -
src/plugin/lib-xml/plugin.xml | 65 --
src/plugin/microformats-reltag/build.xml | 27 -
src/plugin/microformats-reltag/ivy.xml | 41 -
src/plugin/microformats-reltag/plugin.xml | 49 --
.../reltag/RelTagIndexingFilter.java | 77 --
.../nutch/microformats/reltag/RelTagParser.java | 148 ----
.../nutch/microformats/reltag/package.html | 8 -
src/plugin/mimetype-filter/build.xml | 28 -
src/plugin/mimetype-filter/ivy.xml | 41 -
src/plugin/mimetype-filter/plugin.xml | 37 -
.../mimetype-filter/sample/allow-images.txt | 34 -
.../mimetype-filter/sample/block-html.txt | 34 -
.../indexer/filter/MimeTypeIndexingFilter.java | 273 -------
.../filter/MimeTypeIndexingFilterTest.java | 114 ---
src/plugin/nutch-extensionpoints/build.xml | 30 -
src/plugin/nutch-extensionpoints/ivy.xml | 41 -
src/plugin/nutch-extensionpoints/plugin.xml | 67 --
src/plugin/parse-ext/build.xml | 32 -
src/plugin/parse-ext/command | 24 -
src/plugin/parse-ext/ivy.xml | 41 -
src/plugin/parse-ext/plugin.xml | 60 --
.../org/apache/nutch/parse/ext/ExtParser.java | 183 -----
.../apache/nutch/parse/ext/package-info.java | 22 -
.../apache/nutch/parse/ext/TestExtParser.java | 130 ---
src/plugin/parse-html/build.xml | 40 -
src/plugin/parse-html/ivy.xml | 42 -
src/plugin/parse-html/plugin.xml | 48 --
.../org/apache/nutch/parse/html/DOMBuilder.java | 766 ------------------
.../nutch/parse/html/DOMContentUtils.java | 400 ----------
.../nutch/parse/html/HTMLMetaProcessor.java | 214 -----
.../org/apache/nutch/parse/html/HtmlParser.java | 352 --------
.../parse/html/XMLCharacterRecognizer.java | 112 ---
.../org/apache/nutch/parse/html/package.html | 5 -
.../nutch/parse/html/TestDOMContentUtils.java | 347 --------
.../apache/nutch/parse/html/TestHtmlParser.java | 122 ---
.../parse/html/TestRobotsMetaProcessor.java | 155 ----
src/plugin/parse-js/build.xml | 22 -
src/plugin/parse-js/ivy.xml | 41 -
src/plugin/parse-js/plugin.xml | 53 --
.../apache/nutch/parse/js/JSParseFilter.java | 301 -------
.../org/apache/nutch/parse/js/package-info.java | 23 -
src/plugin/parse-metatags/README.txt | 17 -
src/plugin/parse-metatags/build.xml | 37 -
src/plugin/parse-metatags/ivy.xml | 41 -
src/plugin/parse-metatags/plugin.xml | 22 -
.../parse-metatags/sample/testMetatags.html | 9 -
.../sample/testMultivalueMetatags.html | 12 -
.../nutch/parse/metatags/MetaTagsParser.java | 124 ---
.../nutch/parse/metatags/package-info.java | 24 -
.../nutch/parse/metatags/TestMetatagParser.java | 104 ---
src/plugin/parse-replace/README.txt | 91 ---
src/plugin/parse-replace/build.xml | 37 -
src/plugin/parse-replace/ivy.xml | 41 -
src/plugin/parse-replace/plugin.xml | 22 -
.../parse-replace/sample/testParseReplace.html | 11 -
.../nutch/parse/replace/ReplaceParser.java | 74 --
.../nutch/parse/replace/package-info.java | 22 -
.../nutch/parse/replace/TestParseReplace.java | 68 --
src/plugin/parse-swf/build.xml | 38 -
src/plugin/parse-swf/ivy.xml | 41 -
src/plugin/parse-swf/lib/javaswf-LICENSE.txt | 33 -
src/plugin/parse-swf/lib/javaswf.jar | Bin 125369 -> 0 bytes
src/plugin/parse-swf/plugin.xml | 44 -
src/plugin/parse-swf/sample/test1.swf | Bin 21054 -> 0 bytes
src/plugin/parse-swf/sample/test1.txt | 60 --
src/plugin/parse-swf/sample/test2.swf | Bin 42534 -> 0 bytes
src/plugin/parse-swf/sample/test2.txt | 5 -
src/plugin/parse-swf/sample/test3.swf | Bin 51562 -> 0 bytes
src/plugin/parse-swf/sample/test3.txt | 11 -
.../org/apache/nutch/parse/swf/SWFParser.java | 685 ----------------
.../apache/nutch/parse/swf/package-info.java | 22 -
.../apache/nutch/parse/swf/TestSWFParser.java | 94 ---
src/plugin/parse-tika/build-ivy.xml | 54 --
src/plugin/parse-tika/build.xml | 55 --
src/plugin/parse-tika/howto_upgrade_tika.txt | 8 -
src/plugin/parse-tika/ivy.xml | 46 --
src/plugin/parse-tika/plugin.xml | 136 ----
src/plugin/parse-tika/sample/encrypted.pdf | Bin 3431 -> 0 bytes
src/plugin/parse-tika/sample/nutch.html | 519 ------------
src/plugin/parse-tika/sample/nutch_logo_tm.gif | Bin 2747 -> 0 bytes
src/plugin/parse-tika/sample/ootest.odt | Bin 20753 -> 0 bytes
src/plugin/parse-tika/sample/ootest.sxw | Bin 20125 -> 0 bytes
src/plugin/parse-tika/sample/ootest.txt | 30 -
src/plugin/parse-tika/sample/pdftest.pdf | 157 ----
src/plugin/parse-tika/sample/rsstest.rss | 37 -
src/plugin/parse-tika/sample/test.rtf | 17 -
src/plugin/parse-tika/sample/word97.doc | Bin 8192 -> 0 bytes
.../tika/BoilerpipeExtractorRepository.java | 62 --
.../org/apache/nutch/parse/tika/DOMBuilder.java | 794 -------------------
.../nutch/parse/tika/DOMContentUtils.java | 402 ----------
.../nutch/parse/tika/HTMLMetaProcessor.java | 214 -----
.../org/apache/nutch/parse/tika/TikaParser.java | 286 -------
.../parse/tika/XMLCharacterRecognizer.java | 112 ---
.../apache/nutch/parse/tika/package-info.java | 23 -
.../apache/nutch/tika/TestDOMContentUtils.java | 337 --------
.../org/apache/nutch/tika/TestFeedParser.java | 121 ---
.../apache/nutch/tika/TestImageMetadata.java | 67 --
.../org/apache/nutch/tika/TestMSWordParser.java | 92 ---
.../org/apache/nutch/tika/TestOOParser.java | 107 ---
.../org/apache/nutch/tika/TestPdfParser.java | 73 --
.../org/apache/nutch/tika/TestRTFParser.java | 81 --
.../nutch/tika/TestRobotsMetaProcessor.java | 156 ----
src/plugin/parse-zip/build.xml | 38 -
src/plugin/parse-zip/ivy.xml | 41 -
src/plugin/parse-zip/plugin.xml | 46 --
src/plugin/parse-zip/sample/test.zip | Bin 182 -> 0 bytes
.../org/apache/nutch/parse/zip/ZipParser.java | 144 ----
.../nutch/parse/zip/ZipTextExtractor.java | 120 ---
.../apache/nutch/parse/zip/package-info.java | 22 -
.../apache/nutch/parse/zip/TestZipParser.java | 71 --
src/plugin/parsefilter-naivebayes/build-ivy.xml | 54 --
src/plugin/parsefilter-naivebayes/build.xml | 22 -
src/plugin/parsefilter-naivebayes/ivy.xml | 49 --
src/plugin/parsefilter-naivebayes/plugin.xml | 56 --
.../nutch/parsefilter/naivebayes/Classify.java | 120 ---
.../naivebayes/NaiveBayesParseFilter.java | 197 -----
.../nutch/parsefilter/naivebayes/Train.java | 148 ----
.../parsefilter/naivebayes/package-info.java | 28 -
src/plugin/parsefilter-regex/build.xml | 27 -
.../data/regex-parsefilter.txt | 10 -
src/plugin/parsefilter-regex/ivy.xml | 37 -
src/plugin/parsefilter-regex/plugin.xml | 42 -
.../parsefilter/regex/RegexParseFilter.java | 199 -----
.../nutch/parsefilter/regex/package-info.java | 23 -
.../parsefilter/regex/TestRegexParseFilter.java | 77 --
src/plugin/plugin.dtd | 206 -----
src/plugin/protocol-file/build.xml | 29 -
src/plugin/protocol-file/ivy.xml | 41 -
src/plugin/protocol-file/plugin.xml | 46 --
.../protocol-file/sample/testprotocolfile.txt | 1 -
.../sample/testprotocolfile_(encoded).txt | 1 -
.../org/apache/nutch/protocol/file/File.java | 228 ------
.../apache/nutch/protocol/file/FileError.java | 36 -
.../nutch/protocol/file/FileException.java | 40 -
.../nutch/protocol/file/FileResponse.java | 317 --------
.../org/apache/nutch/protocol/file/package.html | 5 -
.../nutch/protocol/file/TestProtocolFile.java | 99 ---
src/plugin/protocol-ftp/build.xml | 22 -
src/plugin/protocol-ftp/ivy.xml | 42 -
src/plugin/protocol-ftp/plugin.xml | 46 --
.../org/apache/nutch/protocol/ftp/Client.java | 595 --------------
.../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 -------
.../org/apache/nutch/protocol/ftp/FtpError.java | 36 -
.../apache/nutch/protocol/ftp/FtpException.java | 46 --
.../ftp/FtpExceptionBadSystResponse.java | 29 -
.../FtpExceptionCanNotHaveDataConnection.java | 29 -
...ExceptionControlClosedByForcedDataClose.java | 30 -
.../ftp/FtpExceptionUnknownForcedDataClose.java | 30 -
.../apache/nutch/protocol/ftp/FtpResponse.java | 521 ------------
.../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 ---
.../protocol/ftp/PrintCommandListener.java | 71 --
.../org/apache/nutch/protocol/ftp/package.html | 5 -
src/plugin/protocol-htmlunit/build.xml | 37 -
src/plugin/protocol-htmlunit/ivy.xml | 38 -
src/plugin/protocol-htmlunit/plugin.xml | 51 --
.../apache/nutch/protocol/htmlunit/Http.java | 63 --
.../nutch/protocol/htmlunit/HttpResponse.java | 573 -------------
.../apache/nutch/protocol/htmlunit/package.html | 21 -
src/plugin/protocol-http/build.xml | 50 --
src/plugin/protocol-http/ivy.xml | 41 -
src/plugin/protocol-http/jsp/basic-http.jsp | 44 -
src/plugin/protocol-http/jsp/brokenpage.jsp | 47 --
src/plugin/protocol-http/jsp/redirect301.jsp | 49 --
src/plugin/protocol-http/jsp/redirect302.jsp | 49 --
src/plugin/protocol-http/plugin.xml | 51 --
.../org/apache/nutch/protocol/http/Http.java | 73 --
.../nutch/protocol/http/HttpResponse.java | 558 -------------
.../org/apache/nutch/protocol/http/package.html | 5 -
.../src/test/conf/nutch-site-test.xml | 52 --
.../nutch/protocol/http/TestProtocolHttp.java | 140 ----
src/plugin/protocol-httpclient/build.xml | 45 --
src/plugin/protocol-httpclient/ivy.xml | 42 -
src/plugin/protocol-httpclient/jsp/basic.jsp | 74 --
src/plugin/protocol-httpclient/jsp/cookies.jsp | 63 --
src/plugin/protocol-httpclient/jsp/digest.jsp | 68 --
src/plugin/protocol-httpclient/jsp/noauth.jsp | 36 -
src/plugin/protocol-httpclient/jsp/ntlm.jsp | 89 ---
src/plugin/protocol-httpclient/plugin.xml | 58 --
.../DummySSLProtocolSocketFactory.java | 163 ----
.../httpclient/DummyX509TrustManager.java | 92 ---
.../apache/nutch/protocol/httpclient/Http.java | 572 -------------
.../protocol/httpclient/HttpAuthentication.java | 45 --
.../httpclient/HttpAuthenticationException.java | 71 --
.../httpclient/HttpAuthenticationFactory.java | 98 ---
.../httpclient/HttpBasicAuthentication.java | 199 -----
.../httpclient/HttpFormAuthConfigurer.java | 106 ---
.../httpclient/HttpFormAuthentication.java | 223 ------
.../nutch/protocol/httpclient/HttpResponse.java | 216 -----
.../nutch/protocol/httpclient/package.html | 9 -
.../src/test/conf/httpclient-auth-test.xml | 58 --
.../src/test/conf/nutch-site-test.xml | 52 --
.../httpclient/TestProtocolHttpClient.java | 217 -----
.../protocol-interactiveselenium/README.md | 38 -
.../protocol-interactiveselenium/build-ivy.xml | 54 --
.../protocol-interactiveselenium/build.xml | 37 -
src/plugin/protocol-interactiveselenium/ivy.xml | 42 -
.../protocol-interactiveselenium/plugin.xml | 47 --
.../protocol/interactiveselenium/Http.java | 59 --
.../interactiveselenium/HttpResponse.java | 399 ----------
.../DefalultMultiInteractionHandler.java | 53 --
.../DefaultClickAllAjaxLinksHandler.java | 88 --
.../handlers/DefaultHandler.java | 30 -
.../handlers/InteractiveSeleniumHandler.java | 25 -
.../protocol/interactiveselenium/package.html | 5 -
src/plugin/protocol-selenium/README.md | 208 -----
src/plugin/protocol-selenium/build-ivy.xml | 54 --
src/plugin/protocol-selenium/build.xml | 36 -
src/plugin/protocol-selenium/ivy.xml | 42 -
src/plugin/protocol-selenium/plugin.xml | 47 --
.../apache/nutch/protocol/selenium/Http.java | 59 --
.../nutch/protocol/selenium/HttpResponse.java | 360 ---------
.../apache/nutch/protocol/selenium/package.html | 5 -
src/plugin/scoring-depth/build.xml | 6 -
src/plugin/scoring-depth/ivy.xml | 41 -
src/plugin/scoring-depth/plugin.xml | 24 -
.../nutch/scoring/depth/DepthScoringFilter.java | 207 -----
.../nutch/scoring/depth/package-info.java | 23 -
src/plugin/scoring-link/build.xml | 27 -
src/plugin/scoring-link/ivy.xml | 41 -
src/plugin/scoring-link/plugin.xml | 39 -
.../scoring/link/LinkAnalysisScoringFilter.java | 95 ---
.../apache/nutch/scoring/link/package-info.java | 23 -
src/plugin/scoring-opic/build.xml | 27 -
src/plugin/scoring-opic/ivy.xml | 41 -
src/plugin/scoring-opic/plugin.xml | 39 -
.../nutch/scoring/opic/OPICScoringFilter.java | 173 ----
.../apache/nutch/scoring/opic/package-info.java | 23 -
src/plugin/scoring-similarity/build-ivy.xml | 54 --
src/plugin/scoring-similarity/build.xml | 27 -
src/plugin/scoring-similarity/ivy.xml | 42 -
src/plugin/scoring-similarity/plugin.xml | 45 --
.../scoring/similarity/SimilarityModel.java | 38 -
.../similarity/SimilarityScoringFilter.java | 70 --
.../similarity/cosine/CosineSimilarity.java | 84 --
.../scoring/similarity/cosine/DocVector.java | 57 --
.../nutch/scoring/similarity/cosine/Model.java | 190 -----
.../scoring/similarity/cosine/package-info.java | 7 -
.../similarity/util/LuceneAnalyzerUtil.java | 93 ---
.../similarity/util/LuceneTokenizer.java | 166 ----
.../scoring/similarity/util/package-info.java | 24 -
src/plugin/subcollection/README.txt | 10 -
src/plugin/subcollection/build.xml | 22 -
src/plugin/subcollection/ivy.xml | 41 -
src/plugin/subcollection/plugin.xml | 41 -
.../nutch/collection/CollectionManager.java | 240 ------
.../apache/nutch/collection/Subcollection.java | 259 ------
.../org/apache/nutch/collection/package.html | 36 -
.../SubcollectionIndexingFilter.java | 101 ---
.../indexer/subcollection/package-info.java | 25 -
.../nutch/collection/TestSubcollection.java | 112 ---
src/plugin/tld/build.xml | 22 -
src/plugin/tld/ivy.xml | 41 -
src/plugin/tld/plugin.xml | 51 --
.../nutch/indexer/tld/TLDIndexingFilter.java | 69 --
.../org/apache/nutch/indexer/tld/package.html | 5 -
.../nutch/scoring/tld/TLDScoringFilter.java | 114 ---
.../org/apache/nutch/scoring/tld/package.html | 5 -
src/plugin/urlfilter-automaton/build.xml | 51 --
src/plugin/urlfilter-automaton/ivy.xml | 42 -
src/plugin/urlfilter-automaton/plugin.xml | 43 -
.../urlfilter-automaton/sample/Benchmarks.rules | 26 -
.../urlfilter-automaton/sample/Benchmarks.urls | 297 -------
.../sample/IntranetCrawling.rules | 24 -
.../sample/IntranetCrawling.urls | 8 -
.../sample/WholeWebCrawling.rules | 19 -
.../sample/WholeWebCrawling.urls | 11 -
.../urlfilter/automaton/AutomatonURLFilter.java | 116 ---
.../nutch/urlfilter/automaton/package.html | 9 -
.../automaton/TestAutomatonURLFilter.java | 56 --
src/plugin/urlfilter-domain/build.xml | 28 -
src/plugin/urlfilter-domain/data/hosts.txt | 5 -
src/plugin/urlfilter-domain/ivy.xml | 41 -
src/plugin/urlfilter-domain/plugin.xml | 43 -
.../nutch/urlfilter/domain/DomainURLFilter.java | 212 -----
.../nutch/urlfilter/domain/package-info.java | 25 -
.../urlfilter/domain/TestDomainURLFilter.java | 67 --
src/plugin/urlfilter-domainblacklist/build.xml | 28 -
.../urlfilter-domainblacklist/data/hosts.txt | 5 -
src/plugin/urlfilter-domainblacklist/ivy.xml | 41 -
src/plugin/urlfilter-domainblacklist/plugin.xml | 43 -
.../DomainBlacklistURLFilter.java | 210 -----
.../urlfilter/domainblacklist/package-info.java | 24 -
.../TestDomainBlacklistURLFilter.java | 49 --
src/plugin/urlfilter-ignoreexempt/README.md | 43 -
src/plugin/urlfilter-ignoreexempt/build.xml | 55 --
.../urlfilter-ignoreexempt/data/.donotdelete | 0
src/plugin/urlfilter-ignoreexempt/ivy.xml | 41 -
src/plugin/urlfilter-ignoreexempt/plugin.xml | 45 --
.../ignoreexempt/ExemptionUrlFilter.java | 101 ---
.../urlfilter/ignoreexempt/package-info.java | 24 -
src/plugin/urlfilter-prefix/build.xml | 22 -
src/plugin/urlfilter-prefix/ivy.xml | 41 -
src/plugin/urlfilter-prefix/plugin.xml | 47 --
.../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 -----
.../apache/nutch/urlfilter/prefix/package.html | 5 -
.../urlfilter/prefix/TestPrefixURLFilter.java | 79 --
src/plugin/urlfilter-regex/build.xml | 51 --
src/plugin/urlfilter-regex/ivy.xml | 41 -
src/plugin/urlfilter-regex/plugin.xml | 48 --
.../urlfilter-regex/sample/Benchmarks.rules | 26 -
.../urlfilter-regex/sample/Benchmarks.urls | 297 -------
.../sample/IntranetCrawling.rules | 27 -
.../sample/IntranetCrawling.urls | 8 -
.../sample/WholeWebCrawling.rules | 22 -
.../sample/WholeWebCrawling.urls | 11 -
.../urlfilter-regex/sample/nutch1838.rules | 12 -
.../urlfilter-regex/sample/nutch1838.urls | 3 -
.../nutch/urlfilter/regex/RegexURLFilter.java | 111 ---
.../apache/nutch/urlfilter/regex/package.html | 5 -
.../urlfilter/regex/TestRegexURLFilter.java | 61 --
src/plugin/urlfilter-suffix/build.xml | 22 -
src/plugin/urlfilter-suffix/ivy.xml | 41 -
src/plugin/urlfilter-suffix/plugin.xml | 47 --
.../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 --------
.../nutch/urlfilter/suffix/package-info.java | 23 -
.../urlfilter/suffix/TestSuffixURLFilter.java | 123 ---
src/plugin/urlfilter-validator/build.xml | 22 -
src/plugin/urlfilter-validator/ivy.xml | 41 -
src/plugin/urlfilter-validator/plugin.xml | 41 -
.../nutch/urlfilter/validator/UrlValidator.java | 386 ---------
.../nutch/urlfilter/validator/package.html | 9 -
.../urlfilter/validator/TestUrlValidator.java | 79 --
src/plugin/urlmeta/build.xml | 22 -
src/plugin/urlmeta/ivy.xml | 41 -
src/plugin/urlmeta/plugin.xml | 47 --
.../indexer/urlmeta/URLMetaIndexingFilter.java | 118 ---
.../apache/nutch/indexer/urlmeta/package.html | 12 -
.../scoring/urlmeta/URLMetaScoringFilter.java | 175 ----
.../apache/nutch/scoring/urlmeta/package.html | 11 -
src/plugin/urlnormalizer-ajax/build.xml | 22 -
src/plugin/urlnormalizer-ajax/ivy.xml | 41 -
src/plugin/urlnormalizer-ajax/plugin.xml | 41 -
.../urlnormalizer/ajax/AjaxURLNormalizer.java | 236 ------
.../ajax/TestAjaxURLNormalizer.java | 67 --
src/plugin/urlnormalizer-basic/build.xml | 22 -
src/plugin/urlnormalizer-basic/ivy.xml | 41 -
src/plugin/urlnormalizer-basic/plugin.xml | 41 -
.../urlnormalizer/basic/BasicURLNormalizer.java | 290 -------
.../net/urlnormalizer/basic/package-info.java | 23 -
.../basic/TestBasicURLNormalizer.java | 175 ----
src/plugin/urlnormalizer-host/build.xml | 27 -
src/plugin/urlnormalizer-host/data/hosts.txt | 8 -
src/plugin/urlnormalizer-host/ivy.xml | 41 -
src/plugin/urlnormalizer-host/plugin.xml | 43 -
.../urlnormalizer/host/HostURLNormalizer.java | 198 -----
.../net/urlnormalizer/host/package-info.java | 23 -
.../host/TestHostURLNormalizer.java | 57 --
src/plugin/urlnormalizer-pass/build.xml | 22 -
src/plugin/urlnormalizer-pass/ivy.xml | 41 -
src/plugin/urlnormalizer-pass/plugin.xml | 41 -
.../urlnormalizer/pass/PassURLNormalizer.java | 49 --
.../net/urlnormalizer/pass/package-info.java | 23 -
.../pass/TestPassURLNormalizer.java | 45 --
src/plugin/urlnormalizer-protocol/build.xml | 27 -
.../urlnormalizer-protocol/data/protocols.txt | 7 -
src/plugin/urlnormalizer-protocol/ivy.xml | 41 -
src/plugin/urlnormalizer-protocol/plugin.xml | 43 -
.../protocol/ProtocolURLNormalizer.java | 190 -----
.../protocol/TestProtocolURLNormalizer.java | 55 --
src/plugin/urlnormalizer-querystring/build.xml | 22 -
src/plugin/urlnormalizer-querystring/ivy.xml | 41 -
src/plugin/urlnormalizer-querystring/plugin.xml | 42 -
.../querystring/QuerystringURLNormalizer.java | 91 ---
.../urlnormalizer/querystring/package-info.java | 23 -
.../TestQuerystringURLNormalizer.java | 49 --
src/plugin/urlnormalizer-regex/build.xml | 34 -
src/plugin/urlnormalizer-regex/ivy.xml | 41 -
src/plugin/urlnormalizer-regex/plugin.xml | 41 -
.../sample/regex-normalize-default.test | 84 --
.../sample/regex-normalize-default.xml | 66 --
.../sample/regex-normalize-scope1.test | 8 -
.../sample/regex-normalize-scope1.xml | 21 -
.../urlnormalizer/regex/RegexURLNormalizer.java | 324 --------
.../net/urlnormalizer/regex/package-info.java | 23 -
.../regex/TestRegexURLNormalizer.java | 186 -----
src/plugin/urlnormalizer-slash/build.xml | 27 -
src/plugin/urlnormalizer-slash/data/slashes.txt | 7 -
src/plugin/urlnormalizer-slash/ivy.xml | 41 -
src/plugin/urlnormalizer-slash/plugin.xml | 43 -
.../urlnormalizer/slash/SlashURLNormalizer.java | 224 ------
.../slash/TestSlashURLNormalizer.java | 73 --
.../fetch-test-site/dup_of_pagea.html | 11 -
.../fetch-test-site/exception.html | 13 -
src/testresources/fetch-test-site/index.html | 13 -
.../fetch-test-site/nested_spider_trap.html | 23 -
src/testresources/fetch-test-site/pagea.html | 11 -
src/testresources/fetch-test-site/pageb.html | 11 -
src/testresources/fetch-test-site/robots.txt | 0
src/testresources/test-mime-util/test.xlsx | Bin 3950 -> 0 bytes
.../20150309101625/content/part-00000/.data.crc | Bin 124 -> 0 bytes
.../content/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101625/content/part-00000/data | Bin 14452 -> 0 bytes
.../20150309101625/content/part-00000/index | Bin 217 -> 0 bytes
.../crawl_fetch/part-00000/.data.crc | Bin 12 -> 0 bytes
.../crawl_fetch/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101625/crawl_fetch/part-00000/data | Bin 293 -> 0 bytes
.../20150309101625/crawl_fetch/part-00000/index | Bin 217 -> 0 bytes
.../crawl_generate/.part-00000.crc | Bin 12 -> 0 bytes
.../20150309101625/crawl_generate/part-00000 | Bin 169 -> 0 bytes
.../20150309101625/crawl_parse/.part-00000.crc | Bin 68 -> 0 bytes
.../20150309101625/crawl_parse/part-00000 | Bin 7627 -> 0 bytes
.../parse_data/part-00000/.data.crc | Bin 24 -> 0 bytes
.../parse_data/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101625/parse_data/part-00000/data | Bin 1985 -> 0 bytes
.../20150309101625/parse_data/part-00000/index | Bin 217 -> 0 bytes
.../parse_text/part-00000/.data.crc | Bin 60 -> 0 bytes
.../parse_text/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101625/parse_text/part-00000/data | Bin 6554 -> 0 bytes
.../20150309101625/parse_text/part-00000/index | Bin 217 -> 0 bytes
.../20150309101656/content/part-00000/.data.crc | Bin 3372 -> 0 bytes
.../content/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101656/content/part-00000/data | Bin 430250 -> 0 bytes
.../20150309101656/content/part-00000/index | Bin 220 -> 0 bytes
.../crawl_fetch/part-00000/.data.crc | Bin 104 -> 0 bytes
.../crawl_fetch/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101656/crawl_fetch/part-00000/data | Bin 12121 -> 0 bytes
.../20150309101656/crawl_fetch/part-00000/index | Bin 220 -> 0 bytes
.../crawl_generate/.part-00000.crc | Bin 52 -> 0 bytes
.../20150309101656/crawl_generate/part-00000 | Bin 5590 -> 0 bytes
.../20150309101656/crawl_parse/.part-00000.crc | Bin 1652 -> 0 bytes
.../20150309101656/crawl_parse/part-00000 | Bin 210047 -> 0 bytes
.../parse_data/part-00000/.data.crc | Bin 460 -> 0 bytes
.../parse_data/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101656/parse_data/part-00000/data | Bin 57355 -> 0 bytes
.../20150309101656/parse_data/part-00000/index | Bin 220 -> 0 bytes
.../parse_text/part-00000/.data.crc | Bin 1260 -> 0 bytes
.../parse_text/part-00000/.index.crc | Bin 12 -> 0 bytes
.../20150309101656/parse_text/part-00000/data | Bin 159920 -> 0 bytes
.../20150309101656/parse_text/part-00000/index | Bin 220 -> 0 bytes
1253 files changed, 48889 insertions(+), 46080 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index 5b3c687..7a70f9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,9 @@ build/
runtime/
logs/
/bin/
+
+*.class
+target/
+nutch-core/target
+nutch-plugins/target
+nutch-plugins/*/target
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html
new file mode 100644
index 0000000..6444c41
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html
@@ -0,0 +1,11 @@
+<html>
+ <head>
+ <title>page a</title>
+ </head>
+<body>
+This is page a
+<a href="index.html">home</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/exception.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/exception.html b/nutch-core/src/test/resources/fetch-test-site/exception.html
new file mode 100644
index 0000000..e1192a1
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/exception.html
@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<HTML>
+<HEAD>
+<TITLE>Exception</TITLE>
+<META http-equiv="Content-Type" content="text/html; charset=unicode">
+</HEAD>
+<BODY>
+!!Trying to parse this one will fail with a MalformedInputException!!
+
+Nutch fetcher test page.
+</BODY>
+</HTML>
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/index.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/test/resources/fetch-test-site/index.html b/nutch-core/src/test/resources/fetch-test-site/index.html
new file mode 100644
index 0000000..d73ff3f
--- /dev/null
+++ b/nutch-core/src/test/resources/fetch-test-site/index.html
@@ -0,0 +1,13 @@
+<html>
+ <head>
+ <title>front page</title>
+ </head>
+<body>
+This is front page.
+<a href="pagea.html">Page a</a>
+<a href="pageb.html">Page b</a>
+<a href="dup_of_pagea.html">dup of Page a</a>
+<hr>
+Nutch fetcher test page
+</body>
+</html>
\ No newline at end of file