You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [17/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Thu Jan 29 05:38:59 2015
@@ -35,189 +35,133 @@ import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;
-/**
+/**
* Unit tests for DOMContentUtils.
*/
public class TestDOMContentUtils {
- private static final String[] testPages= {
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"http://www.nutch.org\">"
- + " anchor </a><!--comment-->"
- + "</body></html>"),
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"/\">"
- + " home </a><!--comment-->"
- + "<style> style </style>"
- + " <a href=\"bot.html\">"
- + " bots </a>"
- + "</body></html>"),
- new String("<html><head><title> </title>"
- + "</head><body> "
- + "<a href=\"/\"> separate this "
- + "<a href=\"ok\"> from this"
- + "</a></a>"
- + "</body></html>"),
- // this one relies on certain neko fixup behavior, possibly
- // distributing the anchors into the LI's-but not the other
- // anchors (outside of them, instead)! So you get a tree that
- // looks like:
- // ... <li> <a href=/> home </a> </li>
- // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
- // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
- new String("<html><head><title> my title </title>"
- + "</head><body> body "
- + "<ul>"
- + "<li> <a href=\"/\"> home"
- + "<li> <a href=\"1\"> 1"
- + "<li> <a href=\"2\"> 2"
- + "</ul>"
- + "</body></html>"),
- // test frameset link extraction. The invalid frame in the middle will be
- // fixed to a third standalone frame.
- new String("<html><head><title> my title </title>"
- + "</head><frameset rows=\"20,*\"> "
- + "<frame src=\"top.html\">"
- + "</frame>"
- + "<frameset cols=\"20,*\">"
- + "<frame src=\"left.html\">"
- + "<frame src=\"invalid.html\"/>"
- + "</frame>"
- + "<frame src=\"right.html\">"
- + "</frame>"
- + "</frameset>"
- + "</frameset>"
- + "</body></html>"),
- // test <area> and <iframe> link extraction + url normalization
- new String("<html><head><title> my title </title>"
- + "</head><body>"
- + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
- + "<map name=\"green\">"
- + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
- + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
- + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
- + "</map>"
- + "<a name=\"bottom\"/><h1> the bottom </h1> "
- + "<iframe src=\"../docs/index.html\"/>"
- + "</body></html>"),
- // test whitespace processing for plain text extraction
- new String("<html><head>\n <title> my\t\n title\r\n </title>\n"
- + " </head>\n"
- + " <body>\n"
- + " <h1> Whitespace\ttest </h1> \n"
- + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
- + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
- + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
- + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
- + "<table>"
- + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
- + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
- + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
- + "</table>put some text here<Br>and there."
- + "<h2>End\tthis\rmadness\n!</h2>\r\n"
- + " . . . ."
- + "</body> </html>"),
-
- // test that <a rel=nofollow> links are not returned
- new String("<html><head></head><body>"
- + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
- + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
- + "</body></html>"),
- // test that POST form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- // test that all form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\";x\">anchor1</a>"
- + "<a href=\"g;x\">anchor2</a>"
- + "<a href=\"g;x?y#s\">anchor3</a>"
- + "</body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\"g\">anchor1</a>"
- + "<a href=\"g?y#s\">anchor2</a>"
- + "<a href=\"?y=1\">anchor3</a>"
- + "<a href=\"?y=1#s\">anchor4</a>"
- + "<a href=\"?y=1;somethingelse\">anchor5</a>"
- + "</body></html>"),
- new String("<html><head><title> title </title>"
- + "</head><body>"
- + "<a href=\"g\"><!--no anchor--></a>"
- + "<a href=\"g1\"> <!--whitespace--> </a>"
- + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
- + "</body></html>"),
- };
+ private static final String[] testPages = {
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->" + "</body></html>"),
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+ + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+ + "</body></html>"),
+ new String("<html><head><title> </title>" + "</head><body> "
+ + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+ + "</a></a>" + "</body></html>"),
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+ + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+ + "</body></html>"),
+ // test frameset link extraction. The invalid frame in the middle will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+ + "</frame>" + "<frameset cols=\"20,*\">"
+ + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+ + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+ + "</frameset>" + "</frameset>" + "</body></html>"),
+ // test <area> and <iframe> link extraction + url normalization
+ new String(
+ "<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+ // test whitespace processing for plain text extraction
+ new String(
+ "<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ." + "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\"><!--no anchor--></a>"
+ + "<a href=\"g1\"> <!--whitespace--> </a>"
+ + "<a href=\"g2\"> <img src=test.gif alt='bla bla'> </a>"
+ + "</body></html>"), };
private static int SKIP = 9;
- private static String[] testBaseHrefs= {
- "http://www.nutch.org",
- "http://www.nutch.org/docs/foo.html",
- "http://www.nutch.org/docs/",
- "http://www.nutch.org/docs/",
- "http://www.nutch.org/frames/",
- "http://www.nutch.org/maps/",
- "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//",
- "http://www.nutch.org/",
- "http://www.nutch.org/",
- "http://www.nutch.org/",
- "http://www.nutch.org/;something",
- "http://www.nutch.org/"
- };
-
- private static final DocumentFragment testDOMs[]=
- new DocumentFragment[testPages.length];
-
- private static URL[] testBaseHrefURLs=
- new URL[testPages.length];
-
-
- private static final String[] answerText= {
- "title body anchor",
- "title body home bots",
- "separate this from this",
- "my title body home 1 2",
- "my title",
- "my title the bottom",
- "my title Whitespace test whitespace test "
- + "This is a whitespace test . Newlines should appear as space too. "
- + "Tabs are spaces too. This is a break -> and the line after break . "
- + "one two three space here space there no space "
- + "one two two three three four put some text here and there. "
- + "End this madness ! . . . .",
- "ignore ignore",
- "test1 test2",
- "test1 test2",
- "title anchor1 anchor2 anchor3",
- "title anchor1 anchor2 anchor3 anchor4 anchor5",
- "title"
- };
-
- private static final String[] answerTitle= {
- "title",
- "title",
- "",
- "my title",
- "my title",
- "my title",
- "my title",
- "",
- "",
- "",
- "title",
- "title",
- "title"
- };
+ private static String[] testBaseHrefs = { "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org/", "http://www.nutch.org/",
+ "http://www.nutch.org/;something", "http://www.nutch.org/" };
+
+ private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+ private static final String[] answerText = {
+ "title body anchor",
+ "title body home bots",
+ "separate this from this",
+ "my title body home 1 2",
+ "my title",
+ "my title the bottom",
+ "my title Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+ "test1 test2", "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
+
+ private static final String[] answerTitle = { "title", "title", "",
+ "my title", "my title", "my title", "my title", "", "", "", "title",
+ "title", "title" };
// note: should be in page-order
private static Outlink[][] answerOutlinks;
@@ -230,87 +174,64 @@ public class TestDOMContentUtils {
conf = NutchConfiguration.create();
conf.setBoolean("parser.html.form.use_action", true);
utils = new DOMContentUtils(conf);
- DOMFragmentParser parser= new DOMFragmentParser();
+ DOMFragmentParser parser = new DOMFragmentParser();
try {
- parser.setFeature(
- "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
- true);
- } catch (SAXException e) {}
- for (int i= 0; i < testPages.length; i++) {
- DocumentFragment node=
- new HTMLDocumentImpl().createDocumentFragment();
+ parser
+ .setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
+ } catch (SAXException e) {
+ }
+ for (int i = 0; i < testPages.length; i++) {
+ DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
try {
parser.parse(
- new InputSource(
- new ByteArrayInputStream(testPages[i].getBytes()) ),
- node);
- testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
+ new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+ node);
+ testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
} catch (Exception e) {
Assert.assertTrue("caught exception: " + e, false);
- }
- testDOMs[i]= node;
+ }
+ testDOMs[i] = node;
}
try {
- answerOutlinks = new Outlink[][]{
- {
- new Outlink("http://www.nutch.org", "anchor"),
- },
- {
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
- },
- {
- new Outlink("http://www.nutch.org/", "separate this"),
- new Outlink("http://www.nutch.org/docs/ok", "from this"),
- },
- {
- new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/1", "1"),
- new Outlink("http://www.nutch.org/docs/2", "2"),
- },
- {
- new Outlink("http://www.nutch.org/frames/top.html", ""),
- new Outlink("http://www.nutch.org/frames/left.html", ""),
- new Outlink("http://www.nutch.org/frames/invalid.html", ""),
- new Outlink("http://www.nutch.org/frames/right.html", ""),
- },
- {
- new Outlink("http://www.nutch.org/maps/logo.gif", ""),
- new Outlink("http://www.nutch.org/index.html", ""),
- new Outlink("http://www.nutch.org/maps/#bottom", ""),
- new Outlink("http://www.nutch.org/bot.html", ""),
- new Outlink("http://www.nutch.org/docs/index.html", ""),
- },
- {
- new Outlink("http://www.nutch.org/index.html", "whitespace test"),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
- },
- {
- },
- {
- new Outlink("http://www.nutch.org/;x", "anchor1"),
- new Outlink("http://www.nutch.org/g;x", "anchor2"),
- new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
- },
- {
- // this is tricky - see RFC3986 section 5.4.1 example 7
- new Outlink("http://www.nutch.org/g", "anchor1"),
- new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
- new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
- new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
- },
- {
- new Outlink("http://www.nutch.org/g", ""),
- new Outlink("http://www.nutch.org/g1", ""),
- new Outlink("http://www.nutch.org/g2", "bla bla"),
- new Outlink("http://www.nutch.org/test.gif", "bla bla"),
- }
- };
+ answerOutlinks = new Outlink[][] {
+ { new Outlink("http://www.nutch.org", "anchor"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+ { new Outlink("http://www.nutch.org/", "separate this"),
+ new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/1", "1"),
+ new Outlink("http://www.nutch.org/docs/2", "2"), },
+ { new Outlink("http://www.nutch.org/frames/top.html", ""),
+ new Outlink("http://www.nutch.org/frames/left.html", ""),
+ new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+ new Outlink("http://www.nutch.org/frames/right.html", ""), },
+ { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+ new Outlink("http://www.nutch.org/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/#bottom", ""),
+ new Outlink("http://www.nutch.org/bot.html", ""),
+ new Outlink("http://www.nutch.org/docs/index.html", ""), },
+ { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+ {},
+ { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+ {},
+ { new Outlink("http://www.nutch.org/;x", "anchor1"),
+ new Outlink("http://www.nutch.org/g;x", "anchor2"),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+ {
+ // this is tricky - see RFC3986 section 5.4.1 example 7
+ new Outlink("http://www.nutch.org/g", "anchor1"),
+ new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+ new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+ "anchor5") },
+ { new Outlink("http://www.nutch.org/g", ""),
+ new Outlink("http://www.nutch.org/g1", ""),
+ new Outlink("http://www.nutch.org/g2", "bla bla"),
+ new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
} catch (MalformedURLException e) {
@@ -318,58 +239,58 @@ public class TestDOMContentUtils {
}
private static boolean equalsIgnoreWhitespace(String s1, String s2) {
- StringTokenizer st1= new StringTokenizer(s1);
- StringTokenizer st2= new StringTokenizer(s2);
+ StringTokenizer st1 = new StringTokenizer(s1);
+ StringTokenizer st2 = new StringTokenizer(s2);
while (st1.hasMoreTokens()) {
- if (!st2.hasMoreTokens())
+ if (!st2.hasMoreTokens())
return false;
- if ( ! st1.nextToken().equals(st2.nextToken()) )
+ if (!st1.nextToken().equals(st2.nextToken()))
return false;
}
- if (st2.hasMoreTokens())
+ if (st2.hasMoreTokens())
return false;
return true;
}
@Test
public void testGetText() {
- if (testDOMs[0] == null)
+ if (testDOMs[0] == null)
setup();
- for (int i= 0; i < testPages.length; i++) {
- StringBuffer sb= new StringBuffer();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuffer sb = new StringBuffer();
utils.getText(sb, testDOMs[i]);
- String text= sb.toString();
- Assert.assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
+ String text = sb.toString();
+ Assert.assertTrue(
+ "expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator") + "got text: " + text,
equalsIgnoreWhitespace(answerText[i], text));
}
}
@Test
public void testGetTitle() {
- if (testDOMs[0] == null)
+ if (testDOMs[0] == null)
setup();
- for (int i= 0; i < testPages.length; i++) {
- StringBuffer sb= new StringBuffer();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuffer sb = new StringBuffer();
utils.getTitle(sb, testDOMs[i]);
- String text= sb.toString();
- Assert.assertTrue("expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator")
- + "got text: "+ text,
+ String text = sb.toString();
+ Assert.assertTrue(
+ "expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator") + "got text: " + text,
equalsIgnoreWhitespace(answerTitle[i], text));
}
}
@Test
public void testGetOutlinks() {
- if (testDOMs[0] == null)
+ if (testDOMs[0] == null)
setup();
- for (int i= 0; i < testPages.length; i++) {
- ArrayList<Outlink> outlinks= new ArrayList<Outlink>();
+ for (int i = 0; i < testPages.length; i++) {
+ ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
if (i == SKIP) {
conf.setBoolean("parser.html.form.use_action", false);
utils.setConf(conf);
@@ -378,51 +299,47 @@ public class TestDOMContentUtils {
utils.setConf(conf);
}
utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
- Outlink[] outlinkArr= new Outlink[outlinks.size()];
- outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
+ Outlink[] outlinkArr = new Outlink[outlinks.size()];
+ outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr);
compareOutlinks(answerOutlinks[i], outlinkArr);
}
}
private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
- for (int i= 0; i < o.length; i++) {
+ for (int i = 0; i < o.length; i++) {
sb.append(o[i].toString());
sb.append(System.getProperty("line.separator"));
}
}
private static final String outlinksString(Outlink[] o) {
- StringBuffer sb= new StringBuffer();
+ StringBuffer sb = new StringBuffer();
appendOutlinks(sb, o);
return sb.toString();
}
private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
if (o1.length != o2.length) {
- Assert.assertTrue("got wrong number of outlinks (expecting " + o1.length
- + ", got " + o2.length + ")"
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + outlinksString(o1)
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + outlinksString(o2)
- + System.getProperty("line.separator"),
- false
- );
+ Assert.assertTrue(
+ "got wrong number of outlinks (expecting " + o1.length + ", got "
+ + o2.length + ")" + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + outlinksString(o1) + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + outlinksString(o2) + System.getProperty("line.separator"),
+ false);
}
- for (int i= 0; i < o1.length; i++) {
+ for (int i = 0; i < o1.length; i++) {
if (!o1[i].equals(o2[i])) {
- Assert.assertTrue("got wrong outlinks at position " + i
- + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'"
- + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'",
- false
- );
+ Assert.assertTrue(
+ "got wrong outlinks at position " + i
+ + System.getProperty("line.separator") + "answer: "
+ + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+ + "', anchor: '" + o1[i].getAnchor() + "'"
+ + System.getProperty("line.separator") + "got: "
+ + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+ + "', anchor: '" + o2[i].getAnchor() + "'", false);
}
}
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Thu Jan 29 05:38:59 2015
@@ -33,69 +33,54 @@ import org.slf4j.LoggerFactory;
public class TestHtmlParser {
- public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(TestHtmlParser.class);
- private static final String encodingTestKeywords =
- "français, español, русский язык, čeština, ελληνικά";
- private static final String encodingTestBody =
- "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
- private static final String encodingTestContent =
- "<title>" + encodingTestKeywords + "</title>\n"
- + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n"
- + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
-
- private static String[][] encodingTestPages= {
- {
- "HTML4, utf-8, meta http-equiv, no quotes",
- "utf-8",
- "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
- + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
- + "<html>\n<head>\n"
- + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
- + encodingTestContent
- },
- {
- "HTML4, utf-8, meta http-equiv, single quotes",
- "utf-8",
- "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
- + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
- + "<html>\n<head>\n"
- + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
- + encodingTestContent
- },
- {
- "XHTML, utf-8, meta http-equiv, double quotes",
- "utf-8",
- "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
- + "<html>\n<head>\n"
- + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
- + encodingTestContent
- },
- {
- "HTML5, utf-8, meta charset",
- "utf-8",
- "<!DOCTYPE html>\n<html>\n<head>\n"
- + "<meta charset=\"utf-8\">"
- + encodingTestContent
- },
- {
- "HTML5, utf-8, BOM",
- "utf-8",
- "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
- + encodingTestContent
- },
- {
- "HTML5, utf-16, BOM",
- "utf-16",
- "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
- + encodingTestContent
- }
- };
+ private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
+ private static final String encodingTestBody = "<ul>\n <li>français\n <li>español\n <li>русский язык\n <li>čeština\n <li>ελληνικά\n</ul>";
+ private static final String encodingTestContent = "<title>"
+ + encodingTestKeywords + "</title>\n"
+ + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
+ + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+ private static String[][] encodingTestPages = {
+ {
+ "HTML4, utf-8, meta http-equiv, no quotes",
+ "utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+ + encodingTestContent },
+ {
+ "HTML4, utf-8, meta http-equiv, single quotes",
+ "utf-8",
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+ + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+ + encodingTestContent },
+ {
+ "XHTML, utf-8, meta http-equiv, double quotes",
+ "utf-8",
+ "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+ + "<html>\n<head>\n"
+ + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+ + encodingTestContent },
+ {
+ "HTML5, utf-8, meta charset",
+ "utf-8",
+ "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+ + encodingTestContent },
+ { "HTML5, utf-8, BOM", "utf-8",
+ "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+ { "HTML5, utf-16, BOM", "utf-16",
+ "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
private Configuration conf;
private Parser parser;
- public TestHtmlParser() {
+ public TestHtmlParser() {
conf = NutchConfiguration.create();
parser = new HtmlParser();
parser.setConf(conf);
@@ -104,8 +89,8 @@ public class TestHtmlParser {
protected Parse parse(byte[] contentBytes) {
String dummyUrl = "http://dummy.url/";
return parser.getParse(
- new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(),
- conf)).get(dummyUrl);
+ new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
+ new Metadata(), conf)).get(dummyUrl);
}
@Test
Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Thu Jan 29 05:38:59 2015
@@ -33,120 +33,96 @@ import org.apache.html.dom.*;
public class TestRobotsMetaProcessor {
/*
-
- some sample tags:
-
- <meta name="robots" content="index,follow">
- <meta name="robots" content="noindex,follow">
- <meta name="robots" content="index,nofollow">
- <meta name="robots" content="noindex,nofollow">
-
- <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-
+ *
+ * some sample tags:
+ *
+ * <meta name="robots" content="index,follow"> <meta name="robots"
+ * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+ * <meta name="robots" content="noindex,nofollow">
+ *
+ * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
*/
+ public static String[] tests = {
+ "<html><head><title>test page</title>"
+ + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+ + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"all\"> "
+ + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+ + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+ + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"noindex,follow\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,nofollow\"> "
+ + "</head><body>" + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>"
+ + "<meta name=\"robots\" content=\"index,follow\"> "
+ + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+ + " some text" + "</body></html>",
+
+ "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+ + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+ + " some text" + "</body></html>",
+
+ };
- public static String[] tests=
- {
- "<html><head><title>test page</title>"
- + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
- + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"all\"> "
- + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
- + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"none\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"noindex,follow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,nofollow\"> "
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\" content=\"index,follow\"> "
- + "<base href=\"http://www.nutch.org/\">"
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- "<html><head><title>test page</title>"
- + "<meta name=\"robots\"> "
- + "<base href=\"http://www.nutch.org/base/\">"
- + "</head><body>"
- + " some text"
- + "</body></html>",
-
- };
-
- public static final boolean[][] answers= {
- {true, true, true}, // NONE
- {false, false, true}, // all
- {true, true, true}, // nOnE
- {true, true, false}, // none
- {true, true, false}, // noindex,nofollow
- {true, false, false}, // noindex,follow
- {false, true, false}, // index,nofollow
- {false, false, false}, // index,follow
- {false, false, false}, // missing!
+ public static final boolean[][] answers = { { true, true, true }, // NONE
+ { false, false, true }, // all
+ { true, true, true }, // nOnE
+ { true, true, false }, // none
+ { true, true, false }, // noindex,nofollow
+ { true, false, false }, // noindex,follow
+ { false, true, false }, // index,nofollow
+ { false, false, false }, // index,follow
+ { false, false, false }, // missing!
};
private URL[][] currURLsAndAnswers;
@Test
public void testRobotsMetaProcessor() {
- DOMFragmentParser parser= new DOMFragmentParser();;
+ DOMFragmentParser parser = new DOMFragmentParser();
+ ;
- try {
- currURLsAndAnswers= new URL[][] {
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org"), null},
- {new URL("http://www.nutch.org/foo/"),
- new URL("http://www.nutch.org/")},
- {new URL("http://www.nutch.org"),
- new URL("http://www.nutch.org/base/")}
- };
+ try {
+ currURLsAndAnswers = new URL[][] {
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org"), null },
+ { new URL("http://www.nutch.org/foo/"),
+ new URL("http://www.nutch.org/") },
+ { new URL("http://www.nutch.org"),
+ new URL("http://www.nutch.org/base/") } };
} catch (Exception e) {
Assert.assertTrue("couldn't make test URLs!", false);
}
- for (int i= 0; i < tests.length; i++) {
- byte[] bytes= tests[i].getBytes();
+ for (int i = 0; i < tests.length; i++) {
+ byte[] bytes = tests[i].getBytes();
DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
@@ -156,9 +132,8 @@ public class TestRobotsMetaProcessor {
e.printStackTrace();
}
- HTMLMetaTags robotsMeta= new HTMLMetaTags();
- HTMLMetaProcessor.getMetaTags(robotsMeta, node,
- currURLsAndAnswers[i][0]);
+ HTMLMetaTags robotsMeta = new HTMLMetaTags();
+ HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
Assert.assertTrue("got index wrong on test " + i,
robotsMeta.getNoIndex() == answers[i][0]);
@@ -166,13 +141,13 @@ public class TestRobotsMetaProcessor {
robotsMeta.getNoFollow() == answers[i][1]);
Assert.assertTrue("got cache wrong on test " + i,
robotsMeta.getNoCache() == answers[i][2]);
- Assert.assertTrue("got base href wrong on test " + i + " (got "
- + robotsMeta.getBaseHref() + ")",
- ( (robotsMeta.getBaseHref() == null)
- && (currURLsAndAnswers[i][1] == null) )
- || ( (robotsMeta.getBaseHref() != null)
- && robotsMeta.getBaseHref().equals(
- currURLsAndAnswers[i][1]) ) );
+ Assert
+ .assertTrue(
+ "got base href wrong on test " + i + " (got "
+ + robotsMeta.getBaseHref() + ")",
+ ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+ || ((robotsMeta.getBaseHref() != null) && robotsMeta
+ .getBaseHref().equals(currURLsAndAnswers[i][1])));
}
}
Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Jan 29 05:38:59 2015
@@ -1,19 +1,19 @@
/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements. See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.nutch.parse.js;
import java.io.BufferedReader;
@@ -56,9 +56,9 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
- * This class is a heuristic link extractor for JavaScript files and
- * code snippets. The general idea of a two-pass regex matching comes from
- * Heritrix. Parts of the code come from OutlinkExtractor.java
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java
*/
public class JSParseFilter implements HtmlParseFilter, Parser {
public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);
@@ -66,9 +66,9 @@ public class JSParseFilter implements Ht
private static final int MAX_TITLE_LEN = 80;
private Configuration conf;
-
+
public ParseResult filter(Content content, ParseResult parseResult,
- HTMLMetaTags metaTags, DocumentFragment doc) {
+ HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
@@ -82,37 +82,42 @@ public class JSParseFilter implements Ht
outlinks.addAll(list);
ParseStatus status = parse.getData().getStatus();
String text = parse.getText();
- Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
- ParseData parseData = new ParseData(status, title, newlinks,
- parse.getData().getContentMeta(),
- parse.getData().getParseMeta());
+ Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks
+ .size()]);
+ ParseData parseData = new ParseData(status, title, newlinks, parse
+ .getData().getContentMeta(), parse.getData().getParseMeta());
// replace original parse obj with new one
parseResult.put(content.getUrl(), new ParseText(text), parseData);
}
return parseResult;
}
-
- private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
+
+ private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
+ List<Outlink> outlinks) {
if (n instanceof Element) {
String name = n.getNodeName();
if (name.equalsIgnoreCase("script")) {
- /* String lang = null;
- Node lNode = n.getAttributes().getNamedItem("language");
- if (lNode == null) lang = "javascript";
- else lang = lNode.getNodeValue(); */
+ /*
+ * String lang = null; Node lNode =
+ * n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
+ * "javascript"; else lang = lNode.getNodeValue();
+ */
StringBuffer script = new StringBuffer();
NodeList nn = n.getChildNodes();
if (nn.getLength() > 0) {
for (int i = 0; i < nn.getLength(); i++) {
- if (i > 0) script.append('\n');
+ if (i > 0)
+ script.append('\n');
script.append(nn.item(i).getNodeValue());
}
// if (LOG.isInfoEnabled()) {
- // LOG.info("script: language=" + lang + ", text: " + script.toString());
+ // LOG.info("script: language=" + lang + ", text: " +
+ // script.toString());
// }
Outlink[] links = getJSLinks(script.toString(), "", base);
- if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ if (links != null && links.length > 0)
+ outlinks.addAll(Arrays.asList(links));
// no other children of interest here, go one level up.
return;
}
@@ -124,7 +129,8 @@ public class JSParseFilter implements Ht
// Window: onload,onunload
// Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
// Keyboard: onkeydown,onkeypress,onkeyup
- // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
+ // Mouse:
+ // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
Node anode = attrs.item(i);
Outlink[] links = null;
if (anode.getNodeName().startsWith("on")) {
@@ -135,7 +141,8 @@ public class JSParseFilter implements Ht
links = getJSLinks(val, "", base);
}
}
- if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+ if (links != null && links.length > 0)
+ outlinks.addAll(Arrays.asList(links));
}
}
}
@@ -144,48 +151,56 @@ public class JSParseFilter implements Ht
walk(nl.item(i), parse, metaTags, base, outlinks);
}
}
-
+
public ParseResult getParse(Content c) {
String type = c.getContentType();
- if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
+ if (type != null && !type.trim().equals("")
+ && !type.toLowerCase().startsWith("application/x-javascript"))
return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
- "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
+ "Content not JavaScript: '" + type + "'").getEmptyParseResult(
+ c.getUrl(), getConf());
String script = new String(c.getContent());
Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
- if (outlinks == null) outlinks = new Outlink[0];
+ if (outlinks == null)
+ outlinks = new Outlink[0];
// Title? use the first line of the script...
String title;
int idx = script.indexOf('\n');
if (idx != -1) {
- if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN;
+ if (idx > MAX_TITLE_LEN)
+ idx = MAX_TITLE_LEN;
title = script.substring(0, idx);
} else {
idx = Math.min(MAX_TITLE_LEN, script.length());
title = script.substring(0, idx);
}
ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
- c.getMetadata());
+ c.getMetadata());
return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
-
+
private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
// A simple pattern. This allows also invalid URL characters.
private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+
// Alternative pattern, which limits valid url characters.
- //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
-
+ // private static final String URI_PATTERN =
+ // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+
/**
- * This method extracts URLs from literals embedded in JavaScript.
+ * This method extracts URLs from literals embedded in JavaScript.
*/
private Outlink[] getJSLinks(String plainText, String anchor, String base) {
final List<Outlink> outlinks = new ArrayList<Outlink>();
URL baseURL = null;
-
+
try {
baseURL = new URL(base);
} catch (Exception e) {
- if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", e); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error("getJSLinks", e);
+ }
}
try {
@@ -194,8 +209,8 @@ public class JSParseFilter implements Ht
Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
| Perl5Compiler.MULTILINE_MASK);
final Pattern pattern1 = cp.compile(URI_PATTERN,
- Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
- | Perl5Compiler.MULTILINE_MASK);
+ Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+ | Perl5Compiler.MULTILINE_MASK);
final PatternMatcher matcher = new Perl5Matcher();
final PatternMatcher matcher1 = new Perl5Matcher();
@@ -204,26 +219,27 @@ public class JSParseFilter implements Ht
MatchResult result;
String url;
- //loop the matches
+ // loop the matches
while (matcher.contains(input, pattern)) {
result = matcher.getMatch();
url = result.group(2);
PatternMatcherInput input1 = new PatternMatcherInput(url);
if (!matcher1.matches(input1, pattern1)) {
- //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); }
+ // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'");
+ // }
continue;
}
if (url.startsWith("www.")) {
- url = "http://" + url;
+ url = "http://" + url;
} else {
- // See if candidate URL is parseable. If not, pass and move on to
+ // See if candidate URL is parseable. If not, pass and move on to
// the next match.
try {
url = new URL(baseURL, url).toString();
} catch (MalformedURLException ex) {
if (LOG.isTraceEnabled()) {
- LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
- baseURL + "'", ex);
+ LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
+ + baseURL + "'", ex);
}
continue;
}
@@ -237,12 +253,14 @@ public class JSParseFilter implements Ht
} catch (Exception ex) {
// if it is a malformed URL we just throw it away and continue with
// extraction.
- if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error("getJSLinks", ex);
+ }
}
final Outlink[] retval;
- //create array of the Outlinks
+ // create array of the Outlinks
if (outlinks != null && outlinks.size() > 0) {
retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
} else {
@@ -251,7 +269,7 @@ public class JSParseFilter implements Ht
return retval;
}
-
+
public static void main(String[] args) throws Exception {
if (args.length < 2) {
System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
@@ -261,10 +279,10 @@ public class JSParseFilter implements Ht
BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
StringBuffer sb = new StringBuffer();
String line = null;
- while ((line = br.readLine()) != null)
+ while ((line = br.readLine()) != null)
sb.append(line + "\n");
br.close();
-
+
JSParseFilter parseFilter = new JSParseFilter();
parseFilter.setConf(NutchConfiguration.create());
Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* from JavaScript files and embedded JavaScript code snippets.
*/
package org.apache.nutch.parse.js;
+
Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java Thu Jan 29 05:38:59 2015
@@ -21,3 +21,4 @@
* (see {@link org.apache.nutch.indexer.metadata}).
*/
package org.apache.nutch.parse.metatags;
+
Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original)
+++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Thu Jan 29 05:38:59 2015
@@ -44,11 +44,13 @@ import com.anotherbigidea.io.InStream;
* distribution.
*/
public class SWFParser implements Parser {
- public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.swf");
+ public static final Logger LOG = LoggerFactory
+ .getLogger("org.apache.nutch.parse.swf");
private Configuration conf = null;
- public SWFParser() {}
+ public SWFParser() {
+ }
public void setConf(Configuration conf) {
this.conf = conf;
@@ -68,10 +70,12 @@ public class SWFParser implements Parser
byte[] raw = content.getContent();
String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
- if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
- return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
- "Content truncated at " + raw.length +
- " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf());
+ if (contentLength != null
+ && raw.length != Integer.parseInt(contentLength)) {
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+ + " bytes. Parser can't handle incomplete files.")
+ .getEmptyParseResult(content.getUrl(), getConf());
}
ExtractText extractor = new ExtractText();
@@ -87,7 +91,8 @@ public class SWFParser implements Parser
reader.readFile();
text = extractor.getText();
String atext = extractor.getActionText();
- if (atext != null && atext.length() > 0) text += "\n--------\n" + atext;
+ if (atext != null && atext.length() > 0)
+ text += "\n--------\n" + atext;
// harvest potential outlinks
String[] links = extractor.getUrls();
for (int i = 0; i < links.length; i++) {
@@ -95,19 +100,25 @@ public class SWFParser implements Parser
outlinks.add(out);
}
Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
- if (olinks != null) for (int i = 0; i < olinks.length; i++) {
- outlinks.add(olinks[i]);
- }
+ if (olinks != null)
+ for (int i = 0; i < olinks.length; i++) {
+ outlinks.add(olinks[i]);
+ }
} catch (Exception e) { // run time exception
LOG.error("Error, runtime exception: ", e);
- return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
- }
- if (text == null) text = "";
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as SWF document. " + e).getEmptyParseResult(
+ content.getUrl(), getConf());
+ }
+ if (text == null)
+ text = "";
- Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+ Outlink[] links = (Outlink[]) outlinks
+ .toArray(new Outlink[outlinks.size()]);
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
- content.getMetadata());
- return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
+ content.getMetadata());
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+ parseData));
}
/**
@@ -120,10 +131,9 @@ public class SWFParser implements Parser
in.read(buf);
in.close();
SWFParser parser = new SWFParser();
- ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
- buf, "application/x-shockwave-flash",
- new Metadata(),
- NutchConfiguration.create()));
+ ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
+ "file:" + args[0], buf, "application/x-shockwave-flash",
+ new Metadata(), NutchConfiguration.create()));
Parse p = parseResult.get("file:" + args[0]);
System.out.println("Parse Text:");
System.out.println(p.getText());
@@ -168,7 +178,8 @@ class ExtractText extends SWFTagTypesImp
StringBuffer res = new StringBuffer();
Iterator<String> it = strings.iterator();
while (it.hasNext()) {
- if (res.length() > 0) res.append(' ');
+ if (res.length() > 0)
+ res.append(' ');
res.append(it.next());
}
return res.toString();
@@ -176,10 +187,12 @@ class ExtractText extends SWFTagTypesImp
public String getActionText() {
StringBuffer res = new StringBuffer();
- String[] strings = (String[])actionStrings.toArray(new String[actionStrings.size()]);
+ String[] strings = (String[]) actionStrings
+ .toArray(new String[actionStrings.size()]);
Arrays.sort(strings);
for (int i = 0; i < strings.length; i++) {
- if (i > 0) res.append('\n');
+ if (i > 0)
+ res.append('\n');
res.append(strings[i]);
}
return res.toString();
@@ -196,14 +209,16 @@ class ExtractText extends SWFTagTypesImp
return res;
}
- public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, int arg4) throws IOException {
+ public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
+ int arg4) throws IOException {
tagDefineFontInfo(arg0, arg1, arg2, arg3);
}
/**
* SWFTagTypes interface Save the Text Font character code info
*/
- public void tagDefineFontInfo(int fontId, String fontName, int flags, int[] codes) throws IOException {
+ public void tagDefineFontInfo(int fontId, String fontName, int flags,
+ int[] codes) throws IOException {
// System.out.println("-defineFontInfo id=" + fontId + ", name=" +
// fontName);
fontCodes.put(new Integer(fontId), codes);
@@ -213,16 +228,16 @@ class ExtractText extends SWFTagTypesImp
// XXX codes anyway, so we just give up.
/*
* public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
- * return null;
- * }
+ * return null; }
*/
/**
* SWFTagTypes interface. Save the character code info.
*/
- public SWFVectors tagDefineFont2(int id, int flags, String name, int numGlyphs, int ascent, int descent, int leading,
- int[] codes, int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, int[] kernAdjustments)
- throws IOException {
+ public SWFVectors tagDefineFont2(int id, int flags, String name,
+ int numGlyphs, int ascent, int descent, int leading, int[] codes,
+ int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
+ int[] kernAdjustments) throws IOException {
// System.out.println("-defineFontInfo id=" + id + ", name=" + name);
fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]);
@@ -232,9 +247,10 @@ class ExtractText extends SWFTagTypesImp
/**
* SWFTagTypes interface. Dump any initial text in the field.
*/
- public void tagDefineTextField(int fieldId, String fieldName, String initialText, Rect boundary, int flags,
- AlphaColor textColor, int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
- int rightMargin, int indentation, int lineSpacing) throws IOException {
+ public void tagDefineTextField(int fieldId, String fieldName,
+ String initialText, Rect boundary, int flags, AlphaColor textColor,
+ int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
+ int rightMargin, int indentation, int lineSpacing) throws IOException {
if (initialText != null) {
strings.add(initialText);
}
@@ -243,7 +259,8 @@ class ExtractText extends SWFTagTypesImp
/**
* SWFTagTypes interface
*/
- public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) throws IOException {
+ public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
+ throws IOException {
lastBounds = curBounds;
curBounds = bounds;
return new TextDumper();
@@ -255,7 +272,8 @@ class ExtractText extends SWFTagTypesImp
/**
* SWFTagTypes interface
*/
- public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) throws IOException {
+ public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
+ throws IOException {
lastBounds = curBounds;
curBounds = bounds;
return new TextDumper();
@@ -273,15 +291,16 @@ class ExtractText extends SWFTagTypesImp
public void setY(int y) {
if (firstY)
firstY = false;
- else strings.add("\n"); // Change in Y - dump a new line
+ else
+ strings.add("\n"); // Change in Y - dump a new line
}
/*
* There are some issues with this method: sometimes SWF files define their
- * own font, so short of OCR we cannot guess what is the glyph code -> character
- * mapping. Additionally, some files don't use literal space character, instead
- * they adjust glyphAdvances. We don't handle it at all - in such cases the text
- * will be all glued together.
+ * own font, so short of OCR we cannot guess what is the glyph code ->
+ * character mapping. Additionally, some files don't use literal space
+ * character, instead they adjust glyphAdvances. We don't handle it at all -
+ * in such cases the text will be all glued together.
*/
public void text(int[] glyphIndices, int[] glyphAdvances) {
// System.out.println("-text id=" + fontId);
@@ -310,9 +329,11 @@ class ExtractText extends SWFTagTypesImp
strings.add(new String(chars));
}
- public void color(Color color) {}
+ public void color(Color color) {
+ }
- public void setX(int x) {}
+ public void setX(int x) {
+ }
public void done() {
strings.add("\n");
@@ -367,7 +388,8 @@ class NutchSWFActions extends SWFActionB
public void lookupTable(String[] values) throws IOException {
for (int i = 0; i < values.length; i++) {
- if (!strings.contains(values[i])) strings.add(values[i]);
+ if (!strings.contains(values[i]))
+ strings.add(values[i]);
}
super.lookupTable(values);
dict = values;
@@ -379,7 +401,7 @@ class NutchSWFActions extends SWFActionB
}
public void getURL(int vars, int mode) {
- // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
+ // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
}
public void getURL(String url, String target) throws IOException {
@@ -444,7 +466,8 @@ class NutchSWFActions extends SWFActionB
super.setTarget(var);
}
- public SWFActionBlock startFunction(String var, String[] params) throws IOException {
+ public SWFActionBlock startFunction(String var, String[] params)
+ throws IOException {
stack.push(var);
strings.remove(var);
if (params != null) {
@@ -455,7 +478,8 @@ class NutchSWFActions extends SWFActionB
return this;
}
- public SWFActionBlock startFunction2(String var, int arg1, int arg2, String[] params, int[] arg3) throws IOException {
+ public SWFActionBlock startFunction2(String var, int arg1, int arg2,
+ String[] params, int[] arg3) throws IOException {
stack.push(var);
strings.remove(var);
if (params != null) {
@@ -655,6 +679,7 @@ class SmallStack extends Stack<Object> {
// tolerate underruns
if (this.size() == 0)
return null;
- else return super.pop();
+ else
+ return super.pop();
}
}
Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Parse Flash SWF files.
*/
package org.apache.nutch.parse.swf;
+
Modified: nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (original)
+++ nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Thu Jan 29 05:38:59 2015
@@ -34,17 +34,19 @@ import org.apache.nutch.util.NutchConfig
import org.junit.Assert;
import org.junit.Test;
-/**
+/**
* Unit tests for SWFParser.
*/
public class TestSWFParser {
private String fileSeparator = System.getProperty("file.separator");
// This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data",".");
+ private String sampleDir = System.getProperty("test.data", ".");
- private String[] sampleFiles = new String[]{"test1.swf", "test2.swf", "test3.swf"};
- private String[] sampleTexts = new String[]{"test1.txt", "test2.txt", "test3.txt"};
+ private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
+ "test3.swf" };
+ private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
+ "test3.txt" };
@Test
public void testIt() throws ProtocolException, ParseException {
@@ -58,7 +60,8 @@ public class TestSWFParser {
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString),
+ new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parse(content).get(content.getUrl());
@@ -67,11 +70,12 @@ public class TestSWFParser {
}
}
- public TestSWFParser() {
+ public TestSWFParser() {
for (int i = 0; i < sampleFiles.length; i++) {
try {
// read the test string
- FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleTexts[i]);
+ FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+ + sampleTexts[i]);
StringBuffer sb = new StringBuffer();
int len = 0;
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");