You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/03/29 01:54:41 UTC
svn commit: r1582928 [1/4] - in /nutch/trunk: ./ ivy/
src/plugin/creativecommons/src/test/org/creativecommons/nutch/
src/plugin/feed/src/test/org/apache/nutch/parse/feed/
src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/
src/plugin/inde...
Author: lewismc
Date: Sat Mar 29 00:54:40 2014
New Revision: 1582928
URL: http://svn.apache.org/r1582928
Log:
NUTCH-1737 Upgrade to recent JUnit 4.x
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/pom.xml
nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestSignatureFactory.java
nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java
nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
nutch/trunk/src/test/org/apache/nutch/parse/TestParseText.java
nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
nutch/trunk/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
nutch/trunk/src/test/org/apache/nutch/util/TestEncodingDetector.java
nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java
nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java
nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java
nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Mar 29 00:54:40 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1737 Upgrade to recent JUnit 4.x (lewismc)
+
* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)
* NUTCH-1671 indexchecker to add digest field (snagel, lufeng)
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Sat Mar 29 00:54:40 2014
@@ -924,6 +924,8 @@
<source path="${basedir}/src/plugin/index-basic/src/java/" />
<source path="${basedir}/src/plugin/index-basic/src/test/" />
<source path="${basedir}/src/plugin/indexer-solr/src/java/" />
+ <source path="${basedir}/src/plugin/indexer-elastic/src/java/" />
+ <source path="${basedir}/src/plugin/indexer-dummy/src/java/" />
<source path="${basedir}/src/plugin/index-metadata/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/test/" />
@@ -984,6 +986,8 @@
<source path="${basedir}/src/plugin/urlnormalizer-pass/src/test/" />
<source path="${basedir}/src/plugin/urlnormalizer-regex/src/java/" />
<source path="${basedir}/src/plugin/urlnormalizer-regex/src/test/" />
+ <source path="${basedir}/src/plugin/urlnormalizer-querystring/src/java/" />
+ <source path="${basedir}/src/plugin/urlnormalizer-querystring/src/test/" />
<output path="${basedir}/build/classes" />
</classpath>
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Sat Mar 29 00:54:40 2014
@@ -80,7 +80,7 @@
<!--Configuration: test -->
<!--artifacts needed for testing -->
- <dependency org="junit" name="junit" rev="3.8.1" conf="*->default" />
+ <dependency org="junit" name="junit" rev="4.11" conf="*->default" />
<dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0"
conf="test->default" />
Modified: nutch/trunk/pom.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/pom.xml?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/pom.xml (original)
+++ nutch/trunk/pom.xml Sat Mar 29 00:54:40 2014
@@ -221,7 +221,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>3.8.1</version>
+ <version>4.11</version>
<optional>true</optional>
</dependency>
<dependency>
Modified: nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Sat Mar 29 00:54:40 2014
@@ -23,18 +23,17 @@ import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
-import java.util.Properties;
import java.io.*;
-import java.net.URL;
-import junit.framework.TestCase;
-
-public class TestCCParseFilter extends TestCase {
+public class TestCCParseFilter {
private static final File testDir =
new File(System.getProperty("test.input"));
+ @Test
public void testPages() throws Exception {
pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
"http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
@@ -69,9 +68,9 @@ public class TestCCParseFilter extends T
Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
Metadata metadata = parse.getData().getParseMeta();
- assertEquals(license, metadata.get("License-Url"));
- assertEquals(location, metadata.get("License-Location"));
- assertEquals(type, metadata.get("Work-Type"));
+ Assert.assertEquals(license, metadata.get("License-Url"));
+ Assert.assertEquals(location, metadata.get("License-Location"));
+ Assert.assertEquals(type, metadata.get("Work-Type"));
}
}
Modified: nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java (original)
+++ nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Sat Mar 29 00:54:40 2014
@@ -21,6 +21,8 @@ package org.apache.nutch.parse.feed;
import java.util.Iterator;
import java.util.Map;
+import org.junit.Assert;
+import org.junit.Test;
// APACHE imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -37,9 +39,6 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.util.NutchConfiguration;
-// Junit imports
-import junit.framework.TestCase;
-
/**
*
* @author mattmann
@@ -47,7 +46,7 @@ import junit.framework.TestCase;
* Test Suite for the {@link FeedParser}.
*
*/
-public class TestFeedParser extends TestCase {
+public class TestFeedParser {
private String fileSeparator = System.getProperty("file.separator");
@@ -63,16 +62,6 @@ public class TestFeedParser extends Test
.getName());
/**
- * Default Constructor.
- *
- * @param name
- * The name of this {@link TestCase}.
- */
- public TestFeedParser(String name) {
- super(name);
- }
-
- /**
* Calls the {@link FeedParser} on a sample RSS file and checks that there are
* 3 {@link ParseResult} entries including the below 2 links:
* <ul>
@@ -87,6 +76,7 @@ public class TestFeedParser extends Test
* @throws ParseException
* If the {@link Parser}Layer cannot be loaded.
*/
+ @Test
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
String urlString;
Protocol protocol;
@@ -104,7 +94,7 @@ public class TestFeedParser extends Test
parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
- assertEquals(3, parseResult.size());
+ Assert.assertEquals(3, parseResult.size());
boolean hasLink1 = false, hasLink2 = false, hasLink3=false;
@@ -121,12 +111,12 @@ public class TestFeedParser extends Test
hasLink3 = true;
}
- assertNotNull(entry.getValue());
- assertNotNull(entry.getValue().getData());
+ Assert.assertNotNull(entry.getValue());
+ Assert.assertNotNull(entry.getValue().getData());
}
if (!hasLink1 || !hasLink2 || !hasLink3) {
- fail("Outlinks read from sample rss file are not correct!");
+ Assert.fail("Outlinks read from sample rss file are not correct!");
}
}
Modified: nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Sat Mar 29 00:54:40 2014
@@ -16,8 +16,6 @@
*/
package org.apache.nutch.indexer.anchor;
-import junit.framework.TestCase;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -27,6 +25,8 @@ import org.apache.nutch.indexer.NutchDoc
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
/**
* JUnit test case which tests
@@ -36,14 +36,15 @@ import org.apache.nutch.util.NutchConfig
* @author lewismc
*
*/
-public class TestAnchorIndexingFilter extends TestCase {
+public class TestAnchorIndexingFilter {
+ @Test
public void testDeduplicateAnchor() throws Exception {
Configuration conf = NutchConfiguration.create();
conf.setBoolean("anchorIndexingFilter.deduplicate", true);
AnchorIndexingFilter filter = new AnchorIndexingFilter();
filter.setConf(conf);
- assertNotNull(filter);
+ Assert.assertNotNull(filter);
NutchDocument doc = new NutchDocument();
ParseImpl parse = new ParseImpl("foo bar", new ParseData());
Inlinks inlinks = new Inlinks();
@@ -54,11 +55,11 @@ public class TestAnchorIndexingFilter ex
filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
} catch(Exception e){
e.printStackTrace();
- fail(e.getMessage());
+ Assert.fail(e.getMessage());
}
- assertNotNull(doc);
- assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
- assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
+ Assert.assertNotNull(doc);
+ Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
+ Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
}
}
Modified: nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Sat Mar 29 00:54:40 2014
@@ -28,9 +28,10 @@ import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
import java.util.Date;
-import junit.framework.TestCase;
/**
* JUnit test case which tests
@@ -43,8 +44,9 @@ import junit.framework.TestCase;
*
*/
-public class TestBasicIndexingFilter extends TestCase {
+public class TestBasicIndexingFilter {
+ @Test
public void testBasicIndexingFilter() throws Exception {
Configuration conf = NutchConfiguration.create();
conf.setInt("indexer.max.title.length", 10);
@@ -53,7 +55,7 @@ public class TestBasicIndexingFilter ext
BasicIndexingFilter filter = new BasicIndexingFilter();
filter.setConf(conf);
- assertNotNull(filter);
+ Assert.assertNotNull(filter);
NutchDocument doc = new NutchDocument();
@@ -73,15 +75,15 @@ public class TestBasicIndexingFilter ext
filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
} catch(Exception e){
e.printStackTrace();
- fail(e.getMessage());
+ Assert.fail(e.getMessage());
}
- assertNotNull(doc);
- assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
- assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
- assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
- assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
+ Assert.assertNotNull(doc);
+ Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
+ Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
+ Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
+ Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
doc.getField("url").getValues().get(0));
- assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
- assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
+ Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
+ Assert.assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
}
}
Modified: nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Sat Mar 29 00:54:40 2014
@@ -29,17 +29,19 @@ import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
-import junit.framework.TestCase;
-
-public class TestMoreIndexingFilter extends TestCase {
+public class TestMoreIndexingFilter {
+ @Test
public void testContentType() throws IndexingException {
Configuration conf = NutchConfiguration.create();
assertContentType(conf, "text/html", "text/html");
assertContentType(conf, "text/html; charset=UTF-8", "text/html");
}
+ @Test
public void testGetParts() {
String[] parts = MoreIndexingFilter.getParts("text/html");
assertParts(parts, 2, "text", "html");
@@ -48,12 +50,13 @@ public class TestMoreIndexingFilter exte
/**
* @since NUTCH-901
*/
+ @Test
public void testNoParts(){
Configuration conf = NutchConfiguration.create();
conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
MoreIndexingFilter filter = new MoreIndexingFilter();
filter.setConf(conf);
- assertNotNull(filter);
+ Assert.assertNotNull(filter);
NutchDocument doc = new NutchDocument();
ParseImpl parse = new ParseImpl("foo bar", new ParseData());
@@ -62,14 +65,15 @@ public class TestMoreIndexingFilter exte
}
catch(Exception e){
e.printStackTrace();
- fail(e.getMessage());
+ Assert.fail(e.getMessage());
}
- assertNotNull(doc);
- assertTrue(doc.getFieldNames().contains("type"));
- assertEquals(1, doc.getField("type").getValues().size());
- assertEquals("text/html", doc.getFieldValue("type"));
+ Assert.assertNotNull(doc);
+ Assert.assertTrue(doc.getFieldNames().contains("type"));
+ Assert.assertEquals(1, doc.getField("type").getValues().size());
+ Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
+ @Test
public void testContentDispositionTitle() throws IndexingException {
Configuration conf = NutchConfiguration.create();
@@ -82,13 +86,13 @@ public class TestMoreIndexingFilter exte
new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
"http://www.example.com/"), new CrawlDatum(), new Inlinks());
- assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
+ Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
}
private void assertParts(String[] parts, int count, String... expected) {
- assertEquals(count, parts.length);
+ Assert.assertEquals(count, parts.length);
for (int i = 0; i < expected.length; i++) {
- assertEquals(expected[i], parts[i]);
+ Assert.assertEquals(expected[i], parts[i]);
}
}
@@ -100,6 +104,6 @@ public class TestMoreIndexingFilter exte
NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
"http://www.example.com/"), new CrawlDatum(), new Inlinks());
- assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
+ Assert.assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
}
}
Modified: nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java (original)
+++ nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Sat Mar 29 00:54:40 2014
@@ -23,8 +23,9 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.util.NutchConfiguration;
-
-import junit.framework.TestCase;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
/**
* JUnit test case which tests
@@ -36,7 +37,7 @@ import junit.framework.TestCase;
* @author tejasp
*/
-public class TestStaticFieldIndexerTest extends TestCase {
+public class TestStaticFieldIndexerTest {
Configuration conf;
@@ -46,7 +47,8 @@ public class TestStaticFieldIndexerTest
Text url;
StaticFieldIndexer filter;
- protected void setUp() throws Exception {
+ @Before
+ public void setUp() throws Exception {
conf = NutchConfiguration.create();
parse = new ParseImpl();
url = new Text("http://nutch.apache.org/index.html");
@@ -59,9 +61,10 @@ public class TestStaticFieldIndexerTest
* Test that empty {@code index.static} does not add anything to the document
* @throws Exception
*/
+ @Test
public void testEmptyIndexStatic() throws Exception {
- assertNotNull(filter);
+ Assert.assertNotNull(filter);
filter.setConf(conf);
NutchDocument doc = new NutchDocument();
@@ -70,22 +73,23 @@ public class TestStaticFieldIndexerTest
filter.filter(doc, parse, url, crawlDatum, inlinks);
} catch (Exception e) {
e.printStackTrace();
- fail(e.getMessage());
+ Assert.fail(e.getMessage());
}
- assertNotNull(doc);
- assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
+ Assert.assertNotNull(doc);
+ Assert.assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
}
/**
* Test that valid field:value pairs are added to the document
* @throws Exception
*/
+ @Test
public void testNormalScenario() throws Exception {
conf.set("index.static",
"field1:val1, field2 : val2 val3 , field3, field4 :val4 , ");
- assertNotNull(filter);
+ Assert.assertNotNull(filter);
filter.setConf(conf);
NutchDocument doc = new NutchDocument();
@@ -94,17 +98,17 @@ public class TestStaticFieldIndexerTest
filter.filter(doc, parse, url, crawlDatum, inlinks);
} catch (Exception e) {
e.printStackTrace();
- fail(e.getMessage());
+ Assert.fail(e.getMessage());
}
- assertNotNull(doc);
- assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
- assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
- assertTrue("test if doc has field1", doc.getField("field1").getValues()
+ Assert.assertNotNull(doc);
+ Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
+ Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
+ Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues()
.contains("val1"));
- assertTrue("test if doc has field2", doc.getField("field2").getValues()
+ Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues()
.contains("val2"));
- assertTrue("test if doc has field4", doc.getField("field4").getValues()
+ Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues()
.contains("val4"));
}
}
Modified: nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)
+++ nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Sat Mar 29 00:54:40 2014
@@ -16,13 +16,10 @@
*/
package org.apache.nutch.analysis.lang;
-
-
-// JUnit imports
import java.io.BufferedReader;
import java.io.InputStreamReader;
-import junit.framework.TestCase;
+
// Nutch imports
import org.apache.nutch.metadata.Metadata;
@@ -31,9 +28,11 @@ import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.tika.language.LanguageIdentifier;
+import org.junit.Assert;
+import org.junit.Test;
-public class TestHTMLLanguageParser extends TestCase {
+public class TestHTMLLanguageParser {
private static String URL = "http://foo.bar/";
@@ -50,6 +49,7 @@ public class TestHTMLLanguageParser exte
/**
* Test parsing of language identifiers from html
**/
+ @Test
public void testMetaHTMLParsing() {
try {
@@ -58,16 +58,17 @@ public class TestHTMLLanguageParser exte
for (int t = 0; t < docs.length; t++) {
Content content = getContent(docs[t]);
Parse parse = parser.parse(content).get(content.getUrl());
- assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
+ Assert.assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
}
} catch (Exception e) {
e.printStackTrace(System.out);
- fail(e.toString());
+ Assert.fail(e.toString());
}
}
/** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
+ @Test
public void testParseLanguage() {
String tests[][] = {
{ "(SCHEME=ISO.639-1) sv", "sv" },
@@ -117,7 +118,7 @@ public class TestHTMLLanguageParser exte
};
for (int i=0; i<44; i++) {
- assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
+ Assert.assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
}
}
@@ -129,6 +130,7 @@ public class TestHTMLLanguageParser exte
}
+ @Test
public void testLanguageIndentifier() {
try {
long total = 0;
@@ -150,7 +152,7 @@ public class TestHTMLLanguageParser exte
if (testLine.length() > 256) {
identifier = new LanguageIdentifier(testLine);
lang = identifier.getLanguage();
- assertEquals(tokens[1], lang);
+ Assert.assertEquals(tokens[1], lang);
}
}
testFile.close();
@@ -162,14 +164,14 @@ public class TestHTMLLanguageParser exte
lang = identifier.getLanguage();
System.out.println(lang);
total += System.currentTimeMillis() - start;
- assertEquals(tokens[1], lang);
+ Assert.assertEquals(tokens[1], lang);
}
}
in.close();
System.out.println("Total Time=" + total);
} catch (Exception e) {
e.printStackTrace();
- fail(e.toString());
+ Assert.fail(e.toString());
}
}
Modified: nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Sat Mar 29 00:54:40 2014
@@ -17,8 +17,10 @@
package org.apache.nutch.protocol.http.api;
+import org.junit.Assert;
+import org.junit.Test;
+
import crawlercommons.robots.BaseRobotRules;
-import junit.framework.TestCase;
/**
* JUnit test case which tests
@@ -26,14 +28,14 @@ import junit.framework.TestCase;
* 2. that crawl delay is extracted correctly from the robots file
*
*/
-public class TestRobotRulesParser extends TestCase {
+public class TestRobotRulesParser {
private static final String CONTENT_TYPE = "text/plain";
private static final String SINGLE_AGENT = "Agent1";
private static final String MULTIPLE_AGENTS = "Agent2, Agent1";
private static final String UNKNOWN_AGENT = "AgentABC";
private static final String CR = "\r";
-
+
private static final String ROBOTS_STRING =
"User-Agent: Agent1 #foo" + CR
+ "Disallow: /a" + CR
@@ -50,7 +52,7 @@ public class TestRobotRulesParser extend
+ "" + CR
+ "User-Agent: *" + CR
+ "Disallow: /foo/bar/" + CR; // no crawl delay for other agents
-
+
private static final String[] TEST_PATHS = new String[] {
"http://example.com/a",
"http://example.com/a/bloh/foo.html",
@@ -72,20 +74,22 @@ public class TestRobotRulesParser extend
private HttpRobotRulesParser parser;
private BaseRobotRules rules;
+ public TestRobotRulesParser () {
+ }
public TestRobotRulesParser(String name) {
- super(name);
parser = new HttpRobotRulesParser();
}
/**
- * Test that the robots rules are interpreted correctly by the robots rules parser.
- */
+ * Test that the robots rules are interpreted correctly by the robots rules parser.
+ */
+ @Test
public void testRobotsAgent() {
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
for(int counter = 0; counter < TEST_PATHS.length; counter++) {
- assertTrue("testing on agent (" + SINGLE_AGENT + "), and "
- + "path " + TEST_PATHS[counter]
+ Assert.assertTrue("testing on agent (" + SINGLE_AGENT + "), and "
+ + "path " + TEST_PATHS[counter]
+ " got " + rules.isAllowed(TEST_PATHS[counter]),
rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
@@ -93,24 +97,25 @@ public class TestRobotRulesParser extend
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS);
for(int counter = 0; counter < TEST_PATHS.length; counter++) {
- assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and "
- + "path " + TEST_PATHS[counter]
+ Assert.assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and "
+ + "path " + TEST_PATHS[counter]
+ " got " + rules.isAllowed(TEST_PATHS[counter]),
rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
}
/**
- * Test that the crawl delay is extracted from the robots file for respective agent.
- * If its not specified for a given agent, default value must be returned.
- */
+ * Test that the crawl delay is extracted from the robots file for respective agent.
+ * If its not specified for a given agent, default value must be returned.
+ */
+ @Test
public void testCrawlDelay() {
// for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);
- assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000));
-
+ Assert.assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000));
+
// for UNKNOWN_AGENT, the default crawl delay must be returned.
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);
- assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE));
+ Assert.assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE));
}
}
Modified: nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original)
+++ nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Sat Mar 29 00:54:40 2014
@@ -24,10 +24,7 @@ import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
-// JUnit imports
-import junit.framework.TestCase;
-
-// Commons Logging imports
+import org.junit.Assert;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -40,30 +37,26 @@ import org.apache.nutch.net.URLFilter;
*
* @author Jérôme Charron
*/
-public abstract class RegexURLFilterBaseTest extends TestCase {
-
+public abstract class RegexURLFilterBaseTest {
+
/** My logger */
protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class);
private final static String SEPARATOR = System.getProperty("file.separator");
private final static String SAMPLES = System.getProperty("test.data", ".");
-
- public RegexURLFilterBaseTest(String testName) {
- super(testName);
- }
-
+
protected abstract URLFilter getURLFilter(Reader rules);
protected void bench(int loops, String file) {
try {
bench(loops,
- new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
} catch (Exception e) {
- fail(e.toString());
+ Assert.fail(e.toString());
}
}
-
+
protected void bench(int loops, Reader rules, Reader urls) {
long start = System.currentTimeMillis();
try {
@@ -73,40 +66,40 @@ public abstract class RegexURLFilterBase
test(filter, expected);
}
} catch (Exception e) {
- fail(e.toString());
+ Assert.fail(e.toString());
}
LOG.info("bench time (" + loops + ") " +
- (System.currentTimeMillis()-start) + "ms");
+ (System.currentTimeMillis()-start) + "ms");
}
-
+
protected void test(String file) {
try {
test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
} catch (Exception e) {
- fail(e.toString());
+ Assert.fail(e.toString());
}
}
-
+
protected void test(Reader rules, Reader urls) {
try {
test(getURLFilter(rules), readURLFile(urls));
} catch (Exception e) {
- fail(e.toString());
+ Assert.fail(e.toString());
}
}
-
+
protected void test(URLFilter filter, FilteredURL[] expected) {
for (int i=0; i<expected.length; i++) {
String result = filter.filter(expected[i].url);
if (result != null) {
- assertTrue(expected[i].url, expected[i].sign);
+ Assert.assertTrue(expected[i].url, expected[i].sign);
} else {
- assertFalse(expected[i].url, expected[i].sign);
+ Assert.assertFalse(expected[i].url, expected[i].sign);
}
}
}
-
+
private static FilteredURL[] readURLFile(Reader reader) throws IOException {
BufferedReader in = new BufferedReader(reader);
List<FilteredURL> list = new ArrayList<FilteredURL>();
@@ -118,9 +111,9 @@ public abstract class RegexURLFilterBase
}
return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]);
}
-
+
private static class FilteredURL {
-
+
boolean sign;
String url;
@@ -138,5 +131,5 @@ public abstract class RegexURLFilterBase
url = line.substring(1);
}
}
-
+
}
Modified: nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Sat Mar 29 00:54:40 2014
@@ -21,18 +21,17 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolException;
-
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
-
-import junit.framework.TestCase;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
@@ -49,7 +48,7 @@ import java.io.IOException;
*
* @author John Xing
*/
-public class TestExtParser extends TestCase {
+public class TestExtParser {
private File tempFile = null;
private String urlString = null;
private Content content = null;
@@ -59,10 +58,7 @@ public class TestExtParser extends TestC
// echo -n "nutch rocks nutch rocks nutch rocks" | md5sum
private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";
- public TestExtParser(String name) {
- super(name);
- }
-
+ @Before
protected void setUp() throws ProtocolException, IOException {
// prepare a temp file with expectedText as its content
// This system property is defined in ./src/plugin/build-plugin.xml
@@ -76,7 +72,7 @@ public class TestExtParser extends TestC
// otherwise in java.io.tmpdir
tempFile = File.createTempFile("nutch.test.plugin.ExtParser.",".txt");
}
- urlString = tempFile.toURL().toString();
+ urlString = tempFile.toURI().toURL().toString();
FileOutputStream fos = new FileOutputStream(tempFile);
fos.write(expectedText.getBytes());
@@ -88,6 +84,7 @@ public class TestExtParser extends TestC
protocol = null;
}
+ @After
protected void tearDown() {
// clean content
content = null;
@@ -97,6 +94,7 @@ public class TestExtParser extends TestC
// tempFile.delete();
}
+ @Test
public void testIt() throws ParseException {
String contentType;
@@ -114,13 +112,13 @@ public class TestExtParser extends TestC
contentType = "application/vnd.nutch.example.cat";
content.setContentType(contentType);
parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
- assertEquals(expectedText,parse.getText());
+ Assert.assertEquals(expectedText,parse.getText());
// check external parser that does 'md5sum'
contentType = "application/vnd.nutch.example.md5sum";
content.setContentType(contentType);
parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get(content.getUrl());
- assertTrue(parse.getText().startsWith(expectedMD5sum));
+ Assert.assertTrue(parse.getText().startsWith(expectedMD5sum));
}
}
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Sat Mar 29 00:54:40 2014
@@ -17,8 +17,6 @@
package org.apache.nutch.parse.html;
-import junit.framework.TestCase;
-
import org.apache.nutch.parse.Outlink;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
@@ -30,6 +28,9 @@ import java.util.ArrayList;
import java.util.StringTokenizer;
import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;
@@ -37,125 +38,125 @@ import org.apache.html.dom.*;
/**
* Unit tests for DOMContentUtils.
*/
-public class TestDOMContentUtils extends TestCase {
+public class TestDOMContentUtils {
private static final String[] testPages= {
new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"http://www.nutch.org\">"
- + " anchor </a><!--comment-->"
- + "</body></html>"),
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"/\">"
- + " home </a><!--comment-->"
- + "<style> style </style>"
- + " <a href=\"bot.html\">"
- + " bots </a>"
- + "</body></html>"),
- new String("<html><head><title> </title>"
- + "</head><body> "
- + "<a href=\"/\"> separate this "
- + "<a href=\"ok\"> from this"
- + "</a></a>"
- + "</body></html>"),
- // this one relies on certain neko fixup behavior, possibly
- // distributing the anchors into the LI's-but not the other
- // anchors (outside of them, instead)! So you get a tree that
- // looks like:
- // ... <li> <a href=/> home </a> </li>
- // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
- // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
- new String("<html><head><title> my title </title>"
- + "</head><body> body "
- + "<ul>"
- + "<li> <a href=\"/\"> home"
- + "<li> <a href=\"1\"> 1"
- + "<li> <a href=\"2\"> 2"
- + "</ul>"
- + "</body></html>"),
- // test frameset link extraction. The invalid frame in the middle will be
- // fixed to a third standalone frame.
- new String("<html><head><title> my title </title>"
- + "</head><frameset rows=\"20,*\"> "
- + "<frame src=\"top.html\">"
- + "</frame>"
- + "<frameset cols=\"20,*\">"
- + "<frame src=\"left.html\">"
- + "<frame src=\"invalid.html\"/>"
- + "</frame>"
- + "<frame src=\"right.html\">"
- + "</frame>"
- + "</frameset>"
- + "</frameset>"
- + "</body></html>"),
- // test <area> and <iframe> link extraction + url normalization
- new String("<html><head><title> my title </title>"
- + "</head><body>"
- + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
- + "<map name=\"green\">"
- + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
- + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
- + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
- + "</map>"
- + "<a name=\"bottom\"/><h1> the bottom </h1> "
- + "<iframe src=\"../docs/index.html\"/>"
- + "</body></html>"),
- // test whitespace processing for plain text extraction
- new String("<html><head>\n <title> my\t\n title\r\n </title>\n"
- + " </head>\n"
- + " <body>\n"
- + " <h1> Whitespace\ttest </h1> \n"
- + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
- + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
- + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
- + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
- + "<table>"
- + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
- + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
- + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
- + "</table>put some text here<Br>and there."
- + "<h2>End\tthis\rmadness\n!</h2>\r\n"
- + " . . . ."
- + "</body> </html>"),
-
- // test that <a rel=nofollow> links are not returned
- new String("<html><head></head><body>"
- + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
- + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
- + "</body></html>"),
- // test that POST form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- // test that all form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\";x\">anchor1</a>"
- + "<a href=\"g;x\">anchor2</a>"
- + "<a href=\"g;x?y#s\">anchor3</a>"
- + "</body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\"g\">anchor1</a>"
- + "<a href=\"g?y#s\">anchor2</a>"
- + "<a href=\"?y=1\">anchor3</a>"
- + "<a href=\"?y=1#s\">anchor4</a>"
- + "<a href=\"?y=1;somethingelse\">anchor5</a>"
- + "</body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\"g\"><!--no anchor--></a>"
- + "<a href=\"g1\"> <!--whitespace--> </a>"
- + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
- + "</body></html>"),
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->"
+ + "</body></html>"),
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">"
+ + " home </a><!--comment-->"
+ + "<style> style </style>"
+ + " <a href=\"bot.html\">"
+ + " bots </a>"
+ + "</body></html>"),
+ new String("<html><head><title> </title>"
+ + "</head><body> "
+ + "<a href=\"/\"> separate this "
+ + "<a href=\"ok\"> from this"
+ + "</a></a>"
+ + "</body></html>"),
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body "
+ + "<ul>"
+ + "<li> <a href=\"/\"> home"
+ + "<li> <a href=\"1\"> 1"
+ + "<li> <a href=\"2\"> 2"
+ + "</ul>"
+ + "</body></html>"),
+ // test frameset link extraction. The invalid frame in the middle will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> "
+ + "<frame src=\"top.html\">"
+ + "</frame>"
+ + "<frameset cols=\"20,*\">"
+ + "<frame src=\"left.html\">"
+ + "<frame src=\"invalid.html\"/>"
+ + "</frame>"
+ + "<frame src=\"right.html\">"
+ + "</frame>"
+ + "</frameset>"
+ + "</frameset>"
+ + "</body></html>"),
+ // test <area> and <iframe> link extraction + url normalization
+ new String("<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>"
+ + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>"
+ + "</body></html>"),
+ // test whitespace processing for plain text extraction
+ new String("<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ."
+ + "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>"
+ + "</head><body>"
+ + "<a href=\";x\">anchor1</a>"
+ + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>"
+ + "</body></html>"),
+ new String("<html><head><title> title </title>"
+ + "</head><body>"
+ + "<a href=\"g\">anchor1</a>"
+ + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>"
+ + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>"
+ + "</body></html>"),
+ new String("<html><head><title> title </title>"
+ + "</head><body>"
+ + "<a href=\"g\"><!--no anchor--></a>"
+ + "<a href=\"g1\"> <!--whitespace--> </a>"
+ + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
+ + "</body></html>"),
};
-
+
private static int SKIP = 9;
private static String[] testBaseHrefs= {
@@ -173,12 +174,12 @@ public class TestDOMContentUtils extends
"http://www.nutch.org/;something",
"http://www.nutch.org/"
};
-
+
private static final DocumentFragment testDOMs[]=
- new DocumentFragment[testPages.length];
+ new DocumentFragment[testPages.length];
private static URL[] testBaseHrefURLs=
- new URL[testPages.length];
+ new URL[testPages.length];
private static final String[] answerText= {
@@ -194,12 +195,12 @@ public class TestDOMContentUtils extends
+ "one two three space here space there no space "
+ "one two two three three four put some text here and there. "
+ "End this madness ! . . . .",
- "ignore ignore",
- "test1 test2",
- "test1 test2",
- "title anchor1 anchor2 anchor3",
- "title anchor1 anchor2 anchor3 anchor4 anchor5",
- "title"
+ "ignore ignore",
+ "test1 test2",
+ "test1 test2",
+ "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5",
+ "title"
};
private static final String[] answerTitle= {
@@ -220,15 +221,12 @@ public class TestDOMContentUtils extends
// note: should be in page-order
private static Outlink[][] answerOutlinks;
-
+
private static Configuration conf;
private static DOMContentUtils utils = null;
-
- public TestDOMContentUtils(String name) {
- super(name);
- }
- private static void setup() {
+ @Before
+ public void setup() {
conf = NutchConfiguration.create();
conf.setBoolean("parser.html.form.use_action", true);
utils = new DOMContentUtils(conf);
@@ -239,83 +237,83 @@ public class TestDOMContentUtils extends
true);
} catch (SAXException e) {}
for (int i= 0; i < testPages.length; i++) {
- DocumentFragment node=
+ DocumentFragment node=
new HTMLDocumentImpl().createDocumentFragment();
- try {
- parser.parse(
+ try {
+ parser.parse(
new InputSource(
- new ByteArrayInputStream(testPages[i].getBytes()) ),
- node);
- testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
- } catch (Exception e) {
- assertTrue("caught exception: " + e, false);
- }
+ new ByteArrayInputStream(testPages[i].getBytes()) ),
+ node);
+ testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
+ } catch (Exception e) {
+ Assert.assertTrue("caught exception: " + e, false);
+ }
testDOMs[i]= node;
}
try {
- answerOutlinks = new Outlink[][]{
- {
- new Outlink("http://www.nutch.org", "anchor"),
- },
- {
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
- },
- {
- new Outlink("http://www.nutch.org/", "separate this"),
- new Outlink("http://www.nutch.org/docs/ok", "from this"),
- },
- {
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/1", "1"),
- new Outlink("http://www.nutch.org/docs/2", "2"),
- },
- {
- new Outlink("http://www.nutch.org/frames/top.html", ""),
- new Outlink("http://www.nutch.org/frames/left.html", ""),
- new Outlink("http://www.nutch.org/frames/invalid.html", ""),
- new Outlink("http://www.nutch.org/frames/right.html", ""),
- },
- {
- new Outlink("http://www.nutch.org/maps/logo.gif", ""),
- new Outlink("http://www.nutch.org/index.html", ""),
- new Outlink("http://www.nutch.org/maps/#bottom", ""),
- new Outlink("http://www.nutch.org/bot.html", ""),
- new Outlink("http://www.nutch.org/docs/index.html", ""),
- },
- {
- new Outlink("http://www.nutch.org/index.html", "whitespace test"),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/;x", "anchor1"),
- new Outlink("http://www.nutch.org/g;x", "anchor2"),
- new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
- },
- {
- // this is tricky - see RFC3986 section 5.4.1 example 7
- new Outlink("http://www.nutch.org/g", "anchor1"),
- new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
- new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
- new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
- },
- {
- new Outlink("http://www.nutch.org/g", ""),
- new Outlink("http://www.nutch.org/g1", ""),
- new Outlink("http://www.nutch.org/g2", "bla bla"),
- new Outlink("http://www.nutch.org/test.gif", "bla bla"),
- }
+ answerOutlinks = new Outlink[][]{
+ {
+ new Outlink("http://www.nutch.org", "anchor"),
+ },
+ {
+ new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
+ },
+ {
+ new Outlink("http://www.nutch.org/", "separate this"),
+ new Outlink("http://www.nutch.org/docs/ok", "from this"),
+ },
+ {
+ new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/1", "1"),
+ new Outlink("http://www.nutch.org/docs/2", "2"),
+ },
+ {
+ new Outlink("http://www.nutch.org/frames/top.html", ""),
+ new Outlink("http://www.nutch.org/frames/left.html", ""),
+ new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+ new Outlink("http://www.nutch.org/frames/right.html", ""),
+ },
+ {
+ new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+ new Outlink("http://www.nutch.org/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/#bottom", ""),
+ new Outlink("http://www.nutch.org/bot.html", ""),
+ new Outlink("http://www.nutch.org/docs/index.html", ""),
+ },
+ {
+ new Outlink("http://www.nutch.org/index.html", "whitespace test"),
+ },
+ {
+ },
+ {
+ new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
+ },
+ {
+ },
+ {
+ new Outlink("http://www.nutch.org/;x", "anchor1"),
+ new Outlink("http://www.nutch.org/g;x", "anchor2"),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
+ },
+ {
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
+ },
+ {
+ new Outlink("http://www.nutch.org/g", ""),
+ new Outlink("http://www.nutch.org/g1", ""),
+ new Outlink("http://www.nutch.org/g2", "bla bla"),
+ new Outlink("http://www.nutch.org/test.gif", "bla bla"),
+ }
};
-
+
} catch (MalformedURLException e) {
-
+
}
}
@@ -334,6 +332,7 @@ public class TestDOMContentUtils extends
return true;
}
+ @Test
public void testGetText() {
if (testDOMs[0] == null)
setup();
@@ -341,14 +340,15 @@ public class TestDOMContentUtils extends
StringBuffer sb= new StringBuffer();
utils.getText(sb, testDOMs[i]);
String text= sb.toString();
- assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
- equalsIgnoreWhitespace(answerText[i], text));
+ Assert.assertTrue("expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator")
+ + "got text: "+ text,
+ equalsIgnoreWhitespace(answerText[i], text));
}
}
+ @Test
public void testGetTitle() {
if (testDOMs[0] == null)
setup();
@@ -356,14 +356,15 @@ public class TestDOMContentUtils extends
StringBuffer sb= new StringBuffer();
utils.getTitle(sb, testDOMs[i]);
String text= sb.toString();
- assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
- equalsIgnoreWhitespace(answerTitle[i], text));
+ Assert.assertTrue("expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator")
+ + "got text: "+ text,
+ equalsIgnoreWhitespace(answerTitle[i], text));
}
}
+ @Test
public void testGetOutlinks() {
if (testDOMs[0] == null)
setup();
@@ -398,31 +399,31 @@ public class TestDOMContentUtils extends
private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
if (o1.length != o2.length) {
- assertTrue("got wrong number of outlinks (expecting " + o1.length
- + ", got " + o2.length + ")"
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + outlinksString(o1)
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + outlinksString(o2)
- + System.getProperty("line.separator"),
- false
- );
+ Assert.assertTrue("got wrong number of outlinks (expecting " + o1.length
+ + ", got " + o2.length + ")"
+ + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + outlinksString(o1)
+ + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + outlinksString(o2)
+ + System.getProperty("line.separator"),
+ false
+ );
}
for (int i= 0; i < o1.length; i++) {
if (!o1[i].equals(o2[i])) {
- assertTrue("got wrong outlinks at position " + i
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'"
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'",
- false
- );
-
+ Assert.assertTrue("got wrong outlinks at position " + i
+ + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'"
+ + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'",
+ false
+ );
+
}
}
}
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Sat Mar 29 00:54:40 2014
@@ -21,16 +21,17 @@ import java.nio.charset.Charset;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.html.HtmlParser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import junit.framework.TestCase;
-
-public class TestHtmlParser extends TestCase {
+public class TestHtmlParser {
public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
@@ -90,12 +91,11 @@ public class TestHtmlParser extends Test
+ encodingTestContent
}
};
-
+
private Configuration conf;
private Parser parser;
-
- public TestHtmlParser(String name) {
- super(name);
+
+ public TestHtmlParser() {
conf = NutchConfiguration.create();
parser = new HtmlParser();
parser.setConf(conf);
@@ -107,7 +107,8 @@ public class TestHtmlParser extends Test
new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(),
conf)).get(dummyUrl);
}
-
+
+ @Test
public void testEncodingDetection() {
for (String[] testPage : encodingTestPages) {
String name = testPage[0];
@@ -121,14 +122,14 @@ public class TestHtmlParser extends Test
LOG.info("title:\t" + title);
LOG.info("keywords:\t" + keywords);
LOG.info("text:\t" + text);
- assertEquals("Title not extracted properly (" + name + ")",
+ Assert.assertEquals("Title not extracted properly (" + name + ")",
encodingTestKeywords, title);
for (String keyword : encodingTestKeywords.split(",\\s*")) {
- assertTrue(keyword + " not found in text (" + name + ")",
+ Assert.assertTrue(keyword + " not found in text (" + name + ")",
text.contains(keyword));
}
if (keywords != null) {
- assertEquals("Keywords not extracted properly (" + name + ")",
+ Assert.assertEquals("Keywords not extracted properly (" + name + ")",
encodingTestKeywords, keywords);
}
}
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Sat Mar 29 00:54:40 2014
@@ -17,23 +17,20 @@
package org.apache.nutch.parse.html;
-import junit.framework.TestCase;
-
import org.apache.nutch.parse.HTMLMetaTags;
import java.io.ByteArrayInputStream;
import java.net.URL;
import org.cyberneko.html.parsers.*;
+import org.junit.Assert;
+import org.junit.Test;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;
/** Unit tests for HTMLMetaProcessor. */
-public class TestRobotsMetaProcessor extends TestCase {
- public TestRobotsMetaProcessor(String name) {
- super(name);
- }
+public class TestRobotsMetaProcessor {
/*
@@ -46,71 +43,71 @@ public class TestRobotsMetaProcessor ext
<META HTTP-EQUIV="Pragma" CONTENT="no-cache">
- */
+ */
public static String[] tests=
- {
- "<html><head><title>test page</title>"
- + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
- + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"all\"> "
- + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
- + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"none\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
+ {
"<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
+ + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+ + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"all\"> "
+ + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+ + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"none\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,follow\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,nofollow\"> "
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,follow\"> "
+ + "<base href=\"http://www.nutch.org/\">"
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\"> "
+ + "<base href=\"http://www.nutch.org/base/\">"
+ + "</head><body>"
+ + " some text"
+ + "</body></html>",
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,follow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,nofollow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,follow\"> "
- + "<base href=\"http://www.nutch.org/\">"
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\"> "
- + "<base href=\"http://www.nutch.org/base/\">"
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- };
+ };
public static final boolean[][] answers= {
{true, true, true}, // NONE
@@ -126,25 +123,26 @@ public class TestRobotsMetaProcessor ext
private URL[][] currURLsAndAnswers;
+ @Test
public void testRobotsMetaProcessor() {
DOMFragmentParser parser= new DOMFragmentParser();;
try {
currURLsAndAnswers= new URL[][] {
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org/foo/"),
- new URL("http://www.nutch.org/")},
- {new URL("http://www.nutch.org"),
- new URL("http://www.nutch.org/base/")}
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org"), null},
+ {new URL("http://www.nutch.org/foo/"),
+ new URL("http://www.nutch.org/")},
+ {new URL("http://www.nutch.org"),
+ new URL("http://www.nutch.org/base/")}
};
} catch (Exception e) {
- assertTrue("couldn't make test URLs!", false);
+ Assert.assertTrue("couldn't make test URLs!", false);
}
for (int i= 0; i < tests.length; i++) {
@@ -160,22 +158,22 @@ public class TestRobotsMetaProcessor ext
HTMLMetaTags robotsMeta= new HTMLMetaTags();
HTMLMetaProcessor.getMetaTags(robotsMeta, node,
- currURLsAndAnswers[i][0]);
+ currURLsAndAnswers[i][0]);
+
+ Assert.assertTrue("got index wrong on test " + i,
+ robotsMeta.getNoIndex() == answers[i][0]);
+ Assert.assertTrue("got follow wrong on test " + i,
+ robotsMeta.getNoFollow() == answers[i][1]);
+ Assert.assertTrue("got cache wrong on test " + i,
+ robotsMeta.getNoCache() == answers[i][2]);
+ Assert.assertTrue("got base href wrong on test " + i + " (got "
+ + robotsMeta.getBaseHref() + ")",
+ ( (robotsMeta.getBaseHref() == null)
+ && (currURLsAndAnswers[i][1] == null) )
+ || ( (robotsMeta.getBaseHref() != null)
+ && robotsMeta.getBaseHref().equals(
+ currURLsAndAnswers[i][1]) ) );
- assertTrue("got index wrong on test " + i,
- robotsMeta.getNoIndex() == answers[i][0]);
- assertTrue("got follow wrong on test " + i,
- robotsMeta.getNoFollow() == answers[i][1]);
- assertTrue("got cache wrong on test " + i,
- robotsMeta.getNoCache() == answers[i][2]);
- assertTrue("got base href wrong on test " + i + " (got "
- + robotsMeta.getBaseHref() + ")",
- ( (robotsMeta.getBaseHref() == null)
- && (currURLsAndAnswers[i][1] == null) )
- || ( (robotsMeta.getBaseHref() != null)
- && robotsMeta.getBaseHref().equals(
- currURLsAndAnswers[i][1]) ) );
-
}
}
Modified: nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java?rev=1582928&r1=1582927&r2=1582928&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java Sat Mar 29 00:54:40 2014
@@ -20,8 +20,6 @@ package org.apache.nutch.parse.html;
import java.util.Set;
import java.util.TreeSet;
-import junit.framework.TestCase;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -32,8 +30,10 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
-public class TestMetatagParser extends TestCase {
+public class TestMetatagParser {
private String fileSeparator = System.getProperty("file.separator");
private String sampleDir = System.getProperty("test.data", ".");
@@ -42,10 +42,6 @@ public class TestMetatagParser extends T
private String description = "This is a test of description";
private String keywords = "This is a test of keywords";
- public TestMetatagParser(String name) {
- super(name);
- }
-
public Metadata parseMeta(String fileName, Configuration conf) {
Metadata metadata = null;
try {
@@ -57,21 +53,23 @@ public class TestMetatagParser extends T
metadata = parse.getData().getParseMeta();
} catch (Exception e) {
e.printStackTrace();
- fail(e.toString());
+ Assert.fail(e.toString());
}
return metadata;
}
+ @Test
public void testIt() {
Configuration conf = NutchConfiguration.create();
// check that we get the same values
Metadata parseMeta= parseMeta(sampleFile, conf);
- assertEquals(description, parseMeta.get("metatag.description"));
- assertEquals(keywords, parseMeta.get("metatag.keywords"));
+ Assert.assertEquals(description, parseMeta.get("metatag.description"));
+ Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
}
+ @Test
public void testMultiValueMetatags() {
Configuration conf = NutchConfiguration.create();
conf.set("metatags.names", "keywords;DC.creator");
@@ -87,7 +85,7 @@ public class TestMetatagParser extends T
}
String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"};
for (String val : expectedValues1) {
- assertTrue(failMessage + val, valueSet.contains(val));
+ Assert.assertTrue(failMessage + val, valueSet.contains(val));
}
valueSet.clear();
@@ -96,7 +94,7 @@ public class TestMetatagParser extends T
}
String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"};
for (String val : expectedValues2) {
- assertTrue(failMessage + val, valueSet.contains(val));
+ Assert.assertTrue(failMessage + val, valueSet.contains(val));
}
}