You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/09 07:34:37 UTC
svn commit: r1650447 [17/25] - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/
src/java/org/apache/nutch/api/impl/db/
src/java/org/apache/nutch/api/model/response/
src/java/org/apache/nutch/api/resources/ s...
Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Jan 9 06:34:33 2015
@@ -36,326 +36,253 @@ import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.*;
-/**
+/**
* Unit tests for DOMContentUtils.
*/
public class TestDOMContentUtils {
- private static final String[] testPages= {
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"http://www.nutch.org\">"
- + " anchor </a><!--comment-->"
- + "</body></html>"),
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"/\">"
- + " home </a><!--comment-->"
- + "<style> style </style>"
- + " <a href=\"bot.html\">"
- + " bots </a>"
- + "</body></html>"),
- new String("<html><head><title> </title>"
- + "</head><body> "
- + "<a href=\"/\"> separate this "
- + "<a href=\"ok\"> from this"
- + "</a></a>"
- + "</body></html>"),
- // this one relies on certain neko fixup behavior, possibly
- // distributing the anchors into the LI's-but not the other
- // anchors (outside of them, instead)! So you get a tree that
- // looks like:
- // ... <li> <a href=/> home </a> </li>
- // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
- // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
- new String("<html><head><title> my title </title>"
- + "</head><body> body "
- + "<ul>"
- + "<li> <a href=\"/\"> home"
- + "<li> <a href=\"1\"> 1"
- + "<li> <a href=\"2\"> 2"
- + "</ul>"
- + "</body></html>"),
- // test frameset link extraction. The invalid frame in the middle will be
- // fixed to a third standalone frame.
- new String("<html><head><title> my title </title>"
- + "</head><frameset rows=\"20,*\"> "
- + "<frame src=\"top.html\">"
- + "</frame>"
- + "<frameset cols=\"20,*\">"
- + "<frame src=\"left.html\">"
- + "<frame src=\"invalid.html\"/>"
- + "</frame>"
- + "<frame src=\"right.html\">"
- + "</frame>"
- + "</frameset>"
- + "</frameset>"
- + "</body></html>"),
- // test <area> and <iframe> link extraction + url normalization
- new String("<html><head><title> my title </title>"
- + "</head><body>"
- + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
- + "<map name=\"green\">"
- + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
- + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
- + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
- + "</map>"
- + "<a name=\"bottom\"/><h1> the bottom </h1> "
- + "<iframe src=\"../docs/index.html\"/>"
- + "</body></html>"),
- // test whitespace processing for plain text extraction
- new String("<html><head>\n <title> my\t\n title\r\n </title>\n"
- + " </head>\n"
- + " <body>\n"
- + " <h1> Whitespace\ttest </h1> \n"
- + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
- + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
- + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
- + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
- + "<table>"
- + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
- + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
- + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
- + "</table>put some text here<Br>and there."
- + "<h2>End\tthis\rmadness\n!</h2>\r\n"
- + " . . . ."
- + "</body> </html>"),
-
- // test that <a rel=nofollow> links are not returned
- new String("<html><head></head><body>"
- + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
- + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
- + "</body></html>"),
- // test that POST form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- // test that all form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\";x\">anchor1</a>"
- + "<a href=\"g;x\">anchor2</a>"
- + "<a href=\"g;x?y#s\">anchor3</a>"
- + "</body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\"g\">anchor1</a>"
- + "<a href=\"g?y#s\">anchor2</a>"
- + "<a href=\"?y=1\">anchor3</a>"
- + "<a href=\"?y=1#s\">anchor4</a>"
- + "<a href=\"?y=1;somethingelse\">anchor5</a>"
- + "</body></html>"),
- };
-
+ private static final String[] testPages = {
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->" + "</body></html>"),
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+ + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+ + "</body></html>"),
+ new String("<html><head><title> </title>" + "</head><body> "
+ + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+ + "</a></a>" + "</body></html>"),
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+ + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+ + "</body></html>"),
+ // test frameset link extraction. The invalid frame in the middle will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+ + "</frame>" + "<frameset cols=\"20,*\">"
+ + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+ + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+ + "</frameset>" + "</frameset>" + "</body></html>"),
+ // test <area> and <iframe> link extraction + url normalization
+ new String(
+ "<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+ // test whitespace processing for plain text extraction
+ new String(
+ "<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ." + "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+
private static int SKIP = 9;
- private static String[] testBaseHrefs= {
- "http://www.nutch.org",
- "http://www.nutch.org/docs/foo.html",
- "http://www.nutch.org/docs/",
- "http://www.nutch.org/docs/",
- "http://www.nutch.org/frames/",
- "http://www.nutch.org/maps/",
- "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//",
- "http://www.nutch.org/",
- "http://www.nutch.org/",
- "http://www.nutch.org/",
- "http://www.nutch.org/;something"
- };
-
- private static final DocumentFragment testDOMs[]=
- new DocumentFragment[testPages.length];
-
- private static URL[] testBaseHrefURLs=
- new URL[testPages.length];
-
-
- private static final String[] answerText= {
- "title body anchor",
- "title body home bots",
- "separate this from this",
- "my title body home 1 2",
- "my title",
- "my title the bottom",
- "my title Whitespace test whitespace test "
- + "This is a whitespace test . Newlines should appear as space too. "
- + "Tabs are spaces too. This is a break -> and the line after break . "
- + "one two three space here space there no space "
- + "one two two three three four put some text here and there. "
- + "End this madness ! . . . .",
- "ignore ignore",
- "test1 test2",
- "test1 test2",
- "title anchor1 anchor2 anchor3",
- "title anchor1 anchor2 anchor3 anchor4 anchor5"
- };
-
- private static final String[] answerTitle= {
- "title",
- "title",
- "",
- "my title",
- "my title",
- "my title",
- "my title",
- "",
- "",
- "",
- "title",
- "title"
- };
+ private static String[] testBaseHrefs = { "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org/", "http://www.nutch.org/",
+ "http://www.nutch.org/;something" };
+
+ private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+ private static final String[] answerText = {
+ "title body anchor",
+ "title body home bots",
+ "separate this from this",
+ "my title body home 1 2",
+ "my title",
+ "my title the bottom",
+ "my title Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+ "test1 test2", "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+ private static final String[] answerTitle = { "title", "title", "",
+ "my title", "my title", "my title", "my title", "", "", "", "title",
+ "title" };
// note: should be in page-order
private static Outlink[][] answerOutlinks;
-
+
private static Configuration conf;
private static DOMContentUtils utils = null;
-
+
@Before
public void setup() {
conf = NutchConfiguration.create();
conf.setBoolean("parser.html.form.use_action", true);
utils = new DOMContentUtils(conf);
- DOMFragmentParser parser= new DOMFragmentParser();
+ DOMFragmentParser parser = new DOMFragmentParser();
try {
- parser.setFeature(
- "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
- true);
- } catch (SAXException e) {}
- for (int i= 0; i < testPages.length; i++) {
- DocumentFragment node=
- new HTMLDocumentImpl().createDocumentFragment();
- try {
- parser.parse(
- new InputSource(
- new ByteArrayInputStream(testPages[i].getBytes()) ),
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
+ } catch (SAXException e) {
+ }
+ for (int i = 0; i < testPages.length; i++) {
+ DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+ try {
+ parser.parse(
+ new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
node);
- testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
- } catch (Exception e) {
- assertTrue("caught exception: " + e, false);
- }
- testDOMs[i]= node;
+ testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+ } catch (Exception e) {
+ assertTrue("caught exception: " + e, false);
+ }
+ testDOMs[i] = node;
}
try {
- answerOutlinks = new Outlink[][]{
- {
- new Outlink("http://www.nutch.org", "anchor"),
- },
- {
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
- },
- {
- new Outlink("http://www.nutch.org/", "separate this"),
- new Outlink("http://www.nutch.org/docs/ok", "from this"),
- },
- {
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/1", "1"),
- new Outlink("http://www.nutch.org/docs/2", "2"),
- },
- {
- new Outlink("http://www.nutch.org/frames/top.html", ""),
- new Outlink("http://www.nutch.org/frames/left.html", ""),
- new Outlink("http://www.nutch.org/frames/invalid.html", ""),
- new Outlink("http://www.nutch.org/frames/right.html", ""),
- },
- {
- new Outlink("http://www.nutch.org/maps/logo.gif", ""),
- new Outlink("http://www.nutch.org/index.html", ""),
- new Outlink("http://www.nutch.org/maps/#bottom", ""),
- new Outlink("http://www.nutch.org/bot.html", ""),
- new Outlink("http://www.nutch.org/docs/index.html", ""),
- },
- {
- new Outlink("http://www.nutch.org/index.html", "whitespace test"),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/;x", "anchor1"),
- new Outlink("http://www.nutch.org/g;x", "anchor2"),
- new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
- },
- {
- // this is tricky - see RFC3986 section 5.4.1 example 7
- new Outlink("http://www.nutch.org/g", "anchor1"),
- new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
- new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
- new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
- }
- };
+ answerOutlinks = new Outlink[][] {
+ { new Outlink("http://www.nutch.org", "anchor"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+ { new Outlink("http://www.nutch.org/", "separate this"),
+ new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/1", "1"),
+ new Outlink("http://www.nutch.org/docs/2", "2"), },
+ { new Outlink("http://www.nutch.org/frames/top.html", ""),
+ new Outlink("http://www.nutch.org/frames/left.html", ""),
+ new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+ new Outlink("http://www.nutch.org/frames/right.html", ""), },
+ { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+ new Outlink("http://www.nutch.org/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/#bottom", ""),
+ new Outlink("http://www.nutch.org/bot.html", ""),
+ new Outlink("http://www.nutch.org/docs/index.html", ""), },
+ { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+ {},
+ { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+ {},
+ { new Outlink("http://www.nutch.org/;x", "anchor1"),
+ new Outlink("http://www.nutch.org/g;x", "anchor2"),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+ {
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+ "anchor5") } };
} catch (MalformedURLException e) {
-
- }
+
+ }
}
private static boolean equalsIgnoreWhitespace(String s1, String s2) {
- StringTokenizer st1= new StringTokenizer(s1);
- StringTokenizer st2= new StringTokenizer(s2);
+ StringTokenizer st1 = new StringTokenizer(s1);
+ StringTokenizer st2 = new StringTokenizer(s2);
while (st1.hasMoreTokens()) {
- if (!st2.hasMoreTokens())
+ if (!st2.hasMoreTokens())
return false;
- if ( ! st1.nextToken().equals(st2.nextToken()) )
+ if (!st1.nextToken().equals(st2.nextToken()))
return false;
}
- if (st2.hasMoreTokens())
+ if (st2.hasMoreTokens())
return false;
return true;
}
@Test
public void testGetText() {
- if (testDOMs[0] == null)
+ if (testDOMs[0] == null)
setup();
- for (int i= 0; i < testPages.length; i++) {
- StringBuilder sb= new StringBuilder();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuilder sb = new StringBuilder();
utils.getText(sb, testDOMs[i]);
- String text= sb.toString();
- assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
- equalsIgnoreWhitespace(answerText[i], text));
+ String text = sb.toString();
+ assertTrue(
+ "expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator") + "got text: " + text,
+ equalsIgnoreWhitespace(answerText[i], text));
}
}
@Test
public void testGetTitle() {
- if (testDOMs[0] == null)
+ if (testDOMs[0] == null)
setup();
- for (int i= 0; i < testPages.length; i++) {
- StringBuilder sb= new StringBuilder();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuilder sb = new StringBuilder();
utils.getTitle(sb, testDOMs[i]);
- String text= sb.toString();
- assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
- equalsIgnoreWhitespace(answerTitle[i], text));
+ String text = sb.toString();
+ assertTrue(
+ "expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator") + "got text: " + text,
+ equalsIgnoreWhitespace(answerTitle[i], text));
}
}
@Test
public void testGetOutlinks() {
- if (testDOMs[0] == null)
+ if (testDOMs[0] == null)
setup();
- for (int i= 0; i < testPages.length; i++) {
- ArrayList<Outlink> outlinks= new ArrayList<Outlink>();
+ for (int i = 0; i < testPages.length; i++) {
+ ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
if (i == SKIP) {
conf.setBoolean("parser.html.form.use_action", false);
utils.setConf(conf);
@@ -364,52 +291,48 @@ public class TestDOMContentUtils {
utils.setConf(conf);
}
utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
- Outlink[] outlinkArr= new Outlink[outlinks.size()];
- outlinkArr= outlinks.toArray(outlinkArr);
+ Outlink[] outlinkArr = new Outlink[outlinks.size()];
+ outlinkArr = outlinks.toArray(outlinkArr);
compareOutlinks(answerOutlinks[i], outlinkArr);
}
}
private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
- for (int i= 0; i < o.length; i++) {
+ for (int i = 0; i < o.length; i++) {
sb.append(o[i].toString());
sb.append(System.getProperty("line.separator"));
}
}
private static final String outlinksString(Outlink[] o) {
- StringBuffer sb= new StringBuffer();
+ StringBuffer sb = new StringBuffer();
appendOutlinks(sb, o);
return sb.toString();
}
private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
if (o1.length != o2.length) {
- assertTrue("got wrong number of outlinks (expecting " + o1.length
- + ", got " + o2.length + ")"
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + outlinksString(o1)
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + outlinksString(o2)
- + System.getProperty("line.separator"),
- false
- );
+ assertTrue(
+ "got wrong number of outlinks (expecting " + o1.length + ", got "
+ + o2.length + ")" + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + outlinksString(o1) + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + outlinksString(o2) + System.getProperty("line.separator"),
+ false);
}
- for (int i= 0; i < o1.length; i++) {
+ for (int i = 0; i < o1.length; i++) {
if (!o1[i].equals(o2[i])) {
- assertTrue("got wrong outlinks at position " + i
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'"
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'",
- false
- );
-
+ assertTrue(
+ "got wrong outlinks at position " + i
+ + System.getProperty("line.separator") + "answer: "
+ + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+ + "', anchor: '" + o1[i].getAnchor() + "'"
+ + System.getProperty("line.separator") + "got: "
+ + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+ + "', anchor: '" + o2[i].getAnchor() + "'", false);
+
}
}
}
Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Fri Jan 9 06:34:33 2015
@@ -36,71 +36,55 @@ import static org.junit.Assert.*;
public class TestHtmlParser {
- public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(TestHtmlParser.class);
+
+ private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
+ private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
+ private static final String encodingTestContent = "<title>"
+ + encodingTestKeywords + "</title>\n"
+ + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
+ + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+ private static String[][] encodingTestPages = {
+ {
+ "HTML4, utf-8, meta http-equiv, no quotes",
+ "utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+ + encodingTestContent },
+ {
+ "HTML4, utf-8, meta http-equiv, single quotes",
+ "utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+ + encodingTestContent },
+ {
+ "XHTML, utf-8, meta http-equiv, double quotes",
+ "utf-8",
+ "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+ + encodingTestContent },
+ {
+ "HTML5, utf-8, meta charset",
+ "utf-8",
+ "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+ + encodingTestContent },
+ { "HTML5, utf-8, BOM", "utf-8",
+ "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+ { "HTML5, utf-16, BOM", "utf-16",
+ "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
- private static final String encodingTestKeywords =
- "français, español, русский язык, čeština, ελληνικά";
- private static final String encodingTestBody =
- "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
- private static final String encodingTestContent =
- "<title>" + encodingTestKeywords + "</title>\n"
- + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n"
- + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
-
- private static String[][] encodingTestPages= {
- {
- "HTML4, utf-8, meta http-equiv, no quotes",
- "utf-8",
- "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
- + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
- + "<html>\n<head>\n"
- + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
- + encodingTestContent
- },
- {
- "HTML4, utf-8, meta http-equiv, single quotes",
- "utf-8",
- "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
- + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
- + "<html>\n<head>\n"
- + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
- + encodingTestContent
- },
- {
- "XHTML, utf-8, meta http-equiv, double quotes",
- "utf-8",
- "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
- + "<html>\n<head>\n"
- + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
- + encodingTestContent
- },
- {
- "HTML5, utf-8, meta charset",
- "utf-8",
- "<!DOCTYPE html>\n<html>\n<head>\n"
- + "<meta charset=\"utf-8\">"
- + encodingTestContent
- },
- {
- "HTML5, utf-8, BOM",
- "utf-8",
- "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
- + encodingTestContent
- },
- {
- "HTML5, utf-16, BOM",
- "utf-16",
- "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
- + encodingTestContent
- }
- };
-
private Configuration conf;
private Parser parser;
-
+
private static final String dummyUrl = "http://dummy.url/";
-
@Before
public void setup() {
conf = NutchConfiguration.create();
@@ -115,25 +99,25 @@ public class TestHtmlParser {
page.setContentType(new Utf8("text/html"));
return page;
}
-
+
protected Parse parse(WebPage page) {
return parser.getParse(dummyUrl, page);
}
-
@Test
public void testEncodingDetection() {
for (String[] testPage : encodingTestPages) {
String name = testPage[0];
Charset charset = Charset.forName(testPage[1]);
byte[] contentBytes = testPage[2].getBytes(charset);
- //Parse parse = parse(contentBytes);
+ // Parse parse = parse(contentBytes);
WebPage page = page(contentBytes);
Parse parse = parse(page);
String text = parse.getText();
String title = parse.getTitle();
- //String keywords = parse.getMeta("keywords");
- String keywords = Bytes.toString(page.getMetadata().get(new Utf8("keywords")));
+ // String keywords = parse.getMeta("keywords");
+ String keywords = Bytes.toString(page.getMetadata().get(
+ new Utf8("keywords")));
LOG.info(name);
LOG.info("title:\t" + title);
LOG.info("keywords:\t" + keywords);
Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)
+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Fri Jan 9 06:34:33 2015
@@ -34,120 +34,96 @@ import static org.junit.Assert.*;
public class TestRobotsMetaProcessor {
/*
-
- some sample tags:
-
- <meta name="robots" content="index,follow">
- <meta name="robots" content="noindex,follow">
- <meta name="robots" content="index,nofollow">
- <meta name="robots" content="noindex,nofollow">
-
- <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-
- */
-
-
- public static String[] tests=
- {
- "<html><head><title>test page</title>"
- + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
- + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"all\"> "
- + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
- + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"none\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,follow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,nofollow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,follow\"> "
- + "<base href=\"http://www.nutch.org/\">"
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\"> "
- + "<base href=\"http://www.nutch.org/base/\">"
- + "</head><body>"
- + " some text"
- + "</body></html>",
+ *
+ * some sample tags:
+ *
+ * <meta name="robots" content="index,follow"> <meta name="robots"
+ * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+ * <meta name="robots" content="noindex,nofollow">
+ *
+ * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+ */
+
+ public static String[] tests = {
+ "<html><head><title>test page</title>"
+ + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+ + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"all\"> "
+ + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+ + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+ + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,follow\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,nofollow\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,follow\"> "
+ + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+ + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+ + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+ + " some text" + "</body></html>",
};
- public static final boolean[][] answers= {
- {true, true, true}, // NONE
- {false, false, true}, // all
- {true, true, true}, // nOnE
- {true, true, false}, // none
- {true, true, false}, // noindex,nofollow
- {true, false, false}, // noindex,follow
- {false, true, false}, // index,nofollow
- {false, false, false}, // index,follow
- {false, false, false}, // missing!
+ public static final boolean[][] answers = { { true, true, true }, // NONE
+ { false, false, true }, // all
+ { true, true, true }, // nOnE
+ { true, true, false }, // none
+ { true, true, false }, // noindex,nofollow
+ { true, false, false }, // noindex,follow
+ { false, true, false }, // index,nofollow
+ { false, false, false }, // index,follow
+ { false, false, false }, // missing!
};
private URL[][] currURLsAndAnswers;
@Test
public void testRobotsMetaProcessor() {
- DOMFragmentParser parser= new DOMFragmentParser();;
+ DOMFragmentParser parser = new DOMFragmentParser();
+ ;
- try {
- currURLsAndAnswers= new URL[][] {
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org/foo/"),
- new URL("http://www.nutch.org/")},
- {new URL("http://www.nutch.org"),
- new URL("http://www.nutch.org/base/")}
- };
+ try {
+ currURLsAndAnswers = new URL[][] {
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org/foo/"),
+ new URL("http://www.nutch.org/") },
+ { new URL("http://www.nutch.org"),
+ new URL("http://www.nutch.org/base/") } };
} catch (Exception e) {
assertTrue("couldn't make test URLs!", false);
}
- for (int i= 0; i < tests.length; i++) {
- byte[] bytes= tests[i].getBytes();
+ for (int i = 0; i < tests.length; i++) {
+ byte[] bytes = tests[i].getBytes();
DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
@@ -157,24 +133,22 @@ public class TestRobotsMetaProcessor {
e.printStackTrace();
}
- HTMLMetaTags robotsMeta= new HTMLMetaTags();
- HTMLMetaProcessor.getMetaTags(robotsMeta, node,
- currURLsAndAnswers[i][0]);
+ HTMLMetaTags robotsMeta = new HTMLMetaTags();
+ HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
assertTrue("got index wrong on test " + i,
- robotsMeta.getNoIndex() == answers[i][0]);
+ robotsMeta.getNoIndex() == answers[i][0]);
assertTrue("got follow wrong on test " + i,
- robotsMeta.getNoFollow() == answers[i][1]);
+ robotsMeta.getNoFollow() == answers[i][1]);
assertTrue("got cache wrong on test " + i,
- robotsMeta.getNoCache() == answers[i][2]);
- assertTrue("got base href wrong on test " + i + " (got "
- + robotsMeta.getBaseHref() + ")",
- ( (robotsMeta.getBaseHref() == null)
- && (currURLsAndAnswers[i][1] == null) )
- || ( (robotsMeta.getBaseHref() != null)
- && robotsMeta.getBaseHref().equals(
- currURLsAndAnswers[i][1]) ) );
-
+ robotsMeta.getNoCache() == answers[i][2]);
+ assertTrue(
+ "got base href wrong on test " + i + " (got "
+ + robotsMeta.getBaseHref() + ")",
+ ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+ || ((robotsMeta.getBaseHref() != null) && robotsMeta
+ .getBaseHref().equals(currURLsAndAnswers[i][1])));
+
}
}
Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Fri Jan 9 06:34:33 2015
@@ -56,11 +56,10 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
- * This class is a heuristic link extractor for JavaScript files and
- * code snippets. The general idea of a two-pass regex matching comes from
- * Heritrix. Parts of the code come from OutlinkExtractor.java
- * by Stephan Strittmatter.
- *
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter.
+ *
* @author Andrzej Bialecki <ab@getopt.org>
*/
public class JSParseFilter implements ParseFilter, Parser {
@@ -72,11 +71,17 @@ public class JSParseFilter implements Pa
/**
* Scan the JavaScript looking for possible {@link Outlink}'s
- * @param url URL of the {@link WebPage} to be parsed
- * @param page {@link WebPage} object relative to the URL
- * @param parse {@link Parse} object holding parse status
- * @param metatags within the {@link NutchDocument}
- * @param doc The {@link NutchDocument} object
+ *
+ * @param url
+ * URL of the {@link WebPage} to be parsed
+ * @param page
+ * {@link WebPage} object relative to the URL
+ * @param parse
+ * {@link Parse} object holding parse status
+ * @param metatags
+ * within the {@link NutchDocument}
+ * @param doc
+ * The {@link NutchDocument} object
* @return parse the actual {@link Parse} object
*/
@Override
@@ -98,28 +103,34 @@ public class JSParseFilter implements Pa
return parse;
}
- private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
+ private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
+ List<Outlink> outlinks) {
if (n instanceof Element) {
String name = n.getNodeName();
if (name.equalsIgnoreCase("script")) {
@SuppressWarnings("unused")
String lang = null;
Node lNode = n.getAttributes().getNamedItem("language");
- if (lNode == null) lang = "javascript";
- else lang = lNode.getNodeValue();
+ if (lNode == null)
+ lang = "javascript";
+ else
+ lang = lNode.getNodeValue();
StringBuffer script = new StringBuffer();
NodeList nn = n.getChildNodes();
if (nn.getLength() > 0) {
for (int i = 0; i < nn.getLength(); i++) {
- if (i > 0) script.append('\n');
+ if (i > 0)
+ script.append('\n');
script.append(nn.item(i).getNodeValue());
}
// This logging makes the output very messy.
- //if (LOG.isInfoEnabled()) {
- // LOG.info("script: language=" + lang + ", text: " + script.toString());
- //}
+ // if (LOG.isInfoEnabled()) {
+ // LOG.info("script: language=" + lang + ", text: " +
+ // script.toString());
+ // }
Outlink[] links = getJSLinks(script.toString(), "", base);
- if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ if (links != null && links.length > 0)
+ outlinks.addAll(Arrays.asList(links));
// no other children of interest here, go one level up.
return;
}
@@ -131,7 +142,8 @@ public class JSParseFilter implements Pa
// Window: onload,onunload
// Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
// Keyboard: onkeydown,onkeypress,onkeyup
- // Mouse: onclick,ondblclick,onmousedown,onmouseout,onmouseover,onmouseup
+ // Mouse:
+ // onclick,ondblclick,onmousedown,onmouseout,onmouseover,onmouseup
Node anode = attrs.item(i);
Outlink[] links = null;
if (anode.getNodeName().startsWith("on")) {
@@ -142,7 +154,8 @@ public class JSParseFilter implements Pa
links = getJSLinks(val, "", base);
}
}
- if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ if (links != null && links.length > 0)
+ outlinks.addAll(Arrays.asList(links));
}
}
}
@@ -154,42 +167,51 @@ public class JSParseFilter implements Pa
/**
* Parse the JavaScript content of a {@link WebPage}, extracting possible {@link Outlink}s
- * @param url URL of the {@link WebPage} which is parsed
- * @param page {@link WebPage} object relative to the URL
+ *
+ * @param url
+ * URL of the {@link WebPage} which is parsed
+ * @param page
+ * {@link WebPage} object relative to the URL
* @return parse the actual {@link Parse} object
*/
@Override
public Parse getParse(String url, WebPage page) {
String type = TableUtil.toString(page.getContentType());
- if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
- return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_INVALID_FORMAT,
- "Content not JavaScript: '" + type + "'", getConf());
+ if (type != null && !type.trim().equals("")
+ && !type.toLowerCase().startsWith("application/x-javascript"))
+ return ParseStatusUtils.getEmptyParse(
+ ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '"
+ + type + "'", getConf());
String script = Bytes.toString(page.getContent());
Outlink[] outlinks = getJSLinks(script, "", url);
- if (outlinks == null) outlinks = new Outlink[0];
+ if (outlinks == null)
+ outlinks = new Outlink[0];
// Title? use the first line of the script...
String title;
int idx = script.indexOf('\n');
if (idx != -1) {
- if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN;
+ if (idx > MAX_TITLE_LEN)
+ idx = MAX_TITLE_LEN;
title = script.substring(0, idx);
} else {
idx = Math.min(MAX_TITLE_LEN, script.length());
title = script.substring(0, idx);
}
- Parse parse =
- new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
+ Parse parse = new Parse(script, title, outlinks,
+ ParseStatusUtils.STATUS_SUCCESS);
return parse;
}
private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
// A simple pattern. This allows also invalid URL characters.
private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+
// Alternative pattern, which limits valid url characters.
- //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+ // private static final String URI_PATTERN =
+ // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
/**
- * This method extracts URLs from literals embedded in JavaScript.
+ * This method extracts URLs from literals embedded in JavaScript.
*/
private Outlink[] getJSLinks(String plainText, String anchor, String base) {
@@ -199,8 +221,8 @@ public class JSParseFilter implements Pa
try {
baseURL = new URL(base);
} catch (Exception e) {
- if (LOG.isErrorEnabled()) {
- LOG.error("error assigning base URL", e);
+ if (LOG.isErrorEnabled()) {
+ LOG.error("error assigning base URL", e);
}
}
@@ -208,10 +230,10 @@ public class JSParseFilter implements Pa
final PatternCompiler cp = new Perl5Compiler();
final Pattern pattern = cp.compile(STRING_PATTERN,
Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
- | Perl5Compiler.MULTILINE_MASK);
+ | Perl5Compiler.MULTILINE_MASK);
final Pattern pattern1 = cp.compile(URI_PATTERN,
Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
- | Perl5Compiler.MULTILINE_MASK);
+ | Perl5Compiler.MULTILINE_MASK);
final PatternMatcher matcher = new Perl5Matcher();
final PatternMatcher matcher1 = new Perl5Matcher();
@@ -220,28 +242,28 @@ public class JSParseFilter implements Pa
MatchResult result;
String url;
- //loop the matches
+ // loop the matches
while (matcher.contains(input, pattern)) {
result = matcher.getMatch();
url = result.group(2);
PatternMatcherInput input1 = new PatternMatcherInput(url);
if (!matcher1.matches(input1, pattern1)) {
- if (LOG.isTraceEnabled()) {
- LOG.trace(" - invalid '" + url + "'");
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(" - invalid '" + url + "'");
}
continue;
}
if (url.startsWith("www.")) {
url = "http://" + url;
} else {
- // See if candidate URL is parseable. If not, pass and move on to
+ // See if candidate URL is parseable. If not, pass and move on to
// the next match.
try {
url = new URL(baseURL, url).toString();
} catch (MalformedURLException ex) {
if (LOG.isTraceEnabled()) {
- LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
- baseURL + "'", ex);
+ LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
+ + baseURL + "'", ex);
}
continue;
}
@@ -255,14 +277,14 @@ public class JSParseFilter implements Pa
} catch (Exception ex) {
// if it is a malformed URL we just throw it away and continue with
// extraction.
- if (LOG.isErrorEnabled()) {
- LOG.error(" - invalid or malformed URL", ex);
+ if (LOG.isErrorEnabled()) {
+ LOG.error(" - invalid or malformed URL", ex);
}
}
final Outlink[] retval;
- //create array of the Outlinks
+ // create array of the Outlinks
if (outlinks != null && outlinks.size() > 0) {
retval = outlinks.toArray(new Outlink[0]);
} else {
@@ -273,8 +295,10 @@ public class JSParseFilter implements Pa
}
/**
- * Main method which can be run from command line with the plugin option.
- * The method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js baseURL
+ * Main method which can be run from command line with the plugin option. The
+ * method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js
+ * baseURL
+ *
* @param args
* @throws Exception
*/
@@ -287,7 +311,8 @@ public class JSParseFilter implements Pa
BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
StringBuffer sb = new StringBuffer();
String line = null;
- while ((line = br.readLine()) != null) sb.append(line + "\n");
+ while ((line = br.readLine()) != null)
+ sb.append(line + "\n");
JSParseFilter parseFilter = new JSParseFilter();
parseFilter.setConf(NutchConfiguration.create());
Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
@@ -311,10 +336,9 @@ public class JSParseFilter implements Pa
}
/**
- * Gets all the fields for a given {@link WebPage}.
- * Many datastores need to set up the mapreduce job by specifying the fields
- * needed. All extensions that work on WebPage are able to specify what fields
- * they need.
+ * Gets all the fields for a given {@link WebPage}. Many datastores need to
+ * set up the mapreduce job by specifying the fields needed. All extensions
+ * that work on WebPage are able to specify what fields they need.
*/
@Override
public Collection<WebPage.Field> getFields() {
Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java Fri Jan 9 06:34:33 2015
@@ -20,3 +20,4 @@
* from JavaScript files and embedded JavaScript code snippets.
*/
package org.apache.nutch.parse.js;
+
Modified: nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java Fri Jan 9 06:34:33 2015
@@ -38,9 +38,9 @@ import java.nio.ByteBuffer;
import static org.junit.Assert.assertEquals;
/**
- * JUnit test case for {@link JSParseFilter} which tests
- * 1. That 5 outlinks are extracted from JavaScript snippets embedded in HTML
- * 2. That X outlinks are extracted from a pure JavaScript file (this is temporarily disabled)
+ * JUnit test case for {@link JSParseFilter} which tests 1. That 5 outlinks are
+ * extracted from JavaScript snippets embedded in HTML 2. That X outlinks are
+ * extracted from a pure JavaScript file (this is temporarily disabled)
*
* @author lewismc
*/
@@ -54,47 +54,53 @@ public class TestJSParseFilter {
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-js/build.xml during plugin compilation.
- private String[] sampleFiles = { "parse_pure_js_test.js", "parse_embedded_js_test.html" };
-
+ private String[] sampleFiles = { "parse_pure_js_test.js",
+ "parse_embedded_js_test.html" };
+
private Configuration conf;
-
+
@Before
public void setUp() {
conf = NutchConfiguration.create();
conf.set("file.content.limit", "-1");
}
- public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException {
+ public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException,
+ ParseException, IOException {
String urlString;
Parse parse;
-
+
urlString = "file:" + sampleDir + fileSeparator + sampleFiles;
File file = new File(urlString);
byte[] bytes = new byte[(int) file.length()];
DataInputStream dip = new DataInputStream(new FileInputStream(file));
dip.readFully(bytes);
dip.close();
-
+
WebPage page = WebPage.newBuilder().build();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
MimeUtil mutil = new MimeUtil(conf);
String mime = mutil.getMimeType(file);
page.setContentType(new Utf8(mime));
-
+
parse = new ParseUtil(conf).parse(urlString, page);
return parse.getOutlinks();
}
-
+
@Test
- public void testOutlinkExtraction() throws ProtocolException, ParseException, IOException {
+ public void testOutlinkExtraction() throws ProtocolException, ParseException,
+ IOException {
String[] filenames = new File(sampleDir).list();
for (int i = 0; i < filenames.length; i++) {
if (filenames[i].endsWith(".js") == true) {
- assertEquals("number of outlinks in .js test file should be 5", 5, getOutlinks(sampleFiles));
- // temporarily disabled as a suitable pure JS file could not be found.
- //} else {
- //assertEquals("number of outlinks in .html file should be X", 5, getOutlinks(sampleFiles));
+ assertEquals("number of outlinks in .js test file should be 5", 5,
+ getOutlinks(sampleFiles));
+ // temporarily disabled as a suitable pure JS file could not be
+ // found.
+ // } else {
+ // assertEquals("number of outlinks in .html file should be X", 5,
+ // getOutlinks(sampleFiles));
}
}
}
Modified: nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java (original)
+++ nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java Fri Jan 9 06:34:33 2015
@@ -21,3 +21,4 @@
* (see {@link org.apache.nutch.indexer.metadata}).
*/
package org.apache.nutch.parse.metatags;
+
Modified: nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java Fri Jan 9 06:34:33 2015
@@ -59,14 +59,15 @@ public class TestMetaTagsParser {
/**
*
- *
+ *
* @param fileName
* Name of the test file to parse.
* @param useUtil
* If true, the method uses ParseUtil
* @return If the document is parsed successfully, it returns the metatags
*/
- public Map<CharSequence, ByteBuffer> parseMetaTags(String fileName, boolean useUtil) {
+ public Map<CharSequence, ByteBuffer> parseMetaTags(String fileName,
+ boolean useUtil) {
try {
Configuration conf = NutchConfiguration.create();
String urlString = "file:" + sampleDir + fileSeparator + fileName;