You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC

svn commit: r1655526 [17/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Thu Jan 29 05:38:59 2015
@@ -35,189 +35,133 @@ import org.xml.sax.*;
 import org.w3c.dom.*;
 import org.apache.html.dom.*;
 
-/** 
+/**
  * Unit tests for DOMContentUtils.
  */
 public class TestDOMContentUtils {
 
-  private static final String[] testPages= { 
-    new String("<html><head><title> title </title><script> script </script>"
-        + "</head><body> body <a href=\"http://www.nutch.org\">"
-        + " anchor </a><!--comment-->"
-        + "</body></html>"),
-        new String("<html><head><title> title </title><script> script </script>"
-            + "</head><body> body <a href=\"/\">"
-            + " home </a><!--comment-->"
-            + "<style> style </style>"
-            + " <a href=\"bot.html\">"
-            + " bots </a>"
-            + "</body></html>"),
-            new String("<html><head><title> </title>"
-                + "</head><body> "
-                + "<a href=\"/\"> separate this "
-                + "<a href=\"ok\"> from this"
-                + "</a></a>"
-                + "</body></html>"),
-                // this one relies on certain neko fixup behavior, possibly
-                // distributing the anchors into the LI's-but not the other
-                // anchors (outside of them, instead)!  So you get a tree that
-                // looks like:
-                // ... <li> <a href=/> home </a> </li>
-                //     <li> <a href=/> <a href="1"> 1 </a> </a> </li>
-                //     <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
-                new String("<html><head><title> my title </title>"
-                    + "</head><body> body "
-                    + "<ul>"
-                    + "<li> <a href=\"/\"> home"
-                    + "<li> <a href=\"1\"> 1"
-                    + "<li> <a href=\"2\"> 2"
-                    + "</ul>"
-                    + "</body></html>"),
-                    // test frameset link extraction. The invalid frame in the middle will be
-                    // fixed to a third standalone frame.
-                    new String("<html><head><title> my title </title>"
-                        + "</head><frameset rows=\"20,*\"> "
-                        + "<frame src=\"top.html\">"
-                        + "</frame>"
-                        + "<frameset cols=\"20,*\">"
-                        + "<frame src=\"left.html\">"
-                        + "<frame src=\"invalid.html\"/>"
-                        + "</frame>"
-                        + "<frame src=\"right.html\">"
-                        + "</frame>"
-                        + "</frameset>"
-                        + "</frameset>"
-                        + "</body></html>"),
-                        // test <area> and <iframe> link extraction + url normalization
-                        new String("<html><head><title> my title </title>"
-                            + "</head><body>"
-                            + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
-                            + "<map name=\"green\">"
-                            + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
-                            + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
-                            + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
-                            + "</map>"
-                            + "<a name=\"bottom\"/><h1> the bottom </h1> "
-                            + "<iframe src=\"../docs/index.html\"/>"
-                            + "</body></html>"),
-                            // test whitespace processing for plain text extraction
-                            new String("<html><head>\n <title> my\t\n  title\r\n </title>\n"
-                                + " </head>\n"
-                                + " <body>\n"
-                                + "    <h1> Whitespace\ttest  </h1> \n"
-                                + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
-                                + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
-                                + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
-                                + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
-                                + "<table>"
-                                + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
-                                + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
-                                + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
-                                + "</table>put some text here<Br>and there."
-                                + "<h2>End\tthis\rmadness\n!</h2>\r\n"
-                                + "         .        .        .         ."
-                                + "</body>  </html>"),
-
-                                // test that <a rel=nofollow> links are not returned
-                                new String("<html><head></head><body>"
-                                    + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
-                                    + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
-                                    + "</body></html>"),
-                                    // test that POST form actions are skipped
-                                    new String("<html><head></head><body>"
-                                        + "<form method='POST' action='/search.jsp'><input type=text>"
-                                        + "<input type=submit><p>test1</p></form>"
-                                        + "<form method='GET' action='/dummy.jsp'><input type=text>"
-                                        + "<input type=submit><p>test2</p></form></body></html>"),
-                                        // test that all form actions are skipped
-                                        new String("<html><head></head><body>"
-                                            + "<form method='POST' action='/search.jsp'><input type=text>"
-                                            + "<input type=submit><p>test1</p></form>"
-                                            + "<form method='GET' action='/dummy.jsp'><input type=text>"
-                                            + "<input type=submit><p>test2</p></form></body></html>"),
-                                            new String("<html><head><title> title </title>"
-                                                + "</head><body>"
-                                                + "<a href=\";x\">anchor1</a>"
-                                                + "<a href=\"g;x\">anchor2</a>"
-                                                + "<a href=\"g;x?y#s\">anchor3</a>"
-                                                + "</body></html>"),  
-                                                new String("<html><head><title> title </title>"
-                                                    + "</head><body>"
-                                                    + "<a href=\"g\">anchor1</a>"
-                                                    + "<a href=\"g?y#s\">anchor2</a>"
-                                                    + "<a href=\"?y=1\">anchor3</a>"
-                                                    + "<a href=\"?y=1#s\">anchor4</a>"
-                                                    + "<a href=\"?y=1;somethingelse\">anchor5</a>"
-                                                    + "</body></html>"), 
-                                                    new String("<html><head><title> title </title>"
-                                                        + "</head><body>"
-                                                        + "<a href=\"g\"><!--no anchor--></a>"
-                                                        + "<a href=\"g1\"> <!--whitespace-->  </a>"
-                                                        + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
-                                                        + "</body></html>"), 
-  };
+  private static final String[] testPages = {
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"http://www.nutch.org\">"
+          + " anchor </a><!--comment-->" + "</body></html>"),
+      new String("<html><head><title> title </title><script> script </script>"
+          + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+          + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+          + "</body></html>"),
+      new String("<html><head><title> </title>" + "</head><body> "
+          + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+          + "</a></a>" + "</body></html>"),
+      // this one relies on certain neko fixup behavior, possibly
+      // distributing the anchors into the LI's-but not the other
+      // anchors (outside of them, instead)! So you get a tree that
+      // looks like:
+      // ... <li> <a href=/> home </a> </li>
+      // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+      // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+      new String("<html><head><title> my title </title>"
+          + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+          + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+          + "</body></html>"),
+      // test frameset link extraction. The invalid frame in the middle will be
+      // fixed to a third standalone frame.
+      new String("<html><head><title> my title </title>"
+          + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+          + "</frame>" + "<frameset cols=\"20,*\">"
+          + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+          + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+          + "</frameset>" + "</frameset>" + "</body></html>"),
+      // test <area> and <iframe> link extraction + url normalization
+      new String(
+          "<html><head><title> my title </title>"
+              + "</head><body>"
+              + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+              + "<map name=\"green\">"
+              + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+              + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+              + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+              + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+              + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+      // test whitespace processing for plain text extraction
+      new String(
+          "<html><head>\n <title> my\t\n  title\r\n </title>\n"
+              + " </head>\n"
+              + " <body>\n"
+              + "    <h1> Whitespace\ttest  </h1> \n"
+              + "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+              + "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+              + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+              + "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+              + "<table>"
+              + "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+              + "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+              + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+              + "</table>put some text here<Br>and there."
+              + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+              + "         .        .        .         ." + "</body>  </html>"),
+
+      // test that <a rel=nofollow> links are not returned
+      new String("<html><head></head><body>"
+          + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+          + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+          + "</body></html>"),
+      // test that POST form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      // test that all form actions are skipped
+      new String("<html><head></head><body>"
+          + "<form method='POST' action='/search.jsp'><input type=text>"
+          + "<input type=submit><p>test1</p></form>"
+          + "<form method='GET' action='/dummy.jsp'><input type=text>"
+          + "<input type=submit><p>test2</p></form></body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+          + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+          + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+          + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"),
+      new String("<html><head><title> title </title>" + "</head><body>"
+          + "<a href=\"g\"><!--no anchor--></a>"
+          + "<a href=\"g1\"> <!--whitespace-->  </a>"
+          + "<a href=\"g2\">  <img src=test.gif alt='bla bla'> </a>"
+          + "</body></html>"), };
 
   private static int SKIP = 9;
 
-  private static String[] testBaseHrefs= {
-    "http://www.nutch.org",     
-    "http://www.nutch.org/docs/foo.html",     
-    "http://www.nutch.org/docs/",     
-    "http://www.nutch.org/docs/",
-    "http://www.nutch.org/frames/",     
-    "http://www.nutch.org/maps/",
-    "http://www.nutch.org/whitespace/",
-    "http://www.nutch.org//",
-    "http://www.nutch.org/",
-    "http://www.nutch.org/",
-    "http://www.nutch.org/",
-    "http://www.nutch.org/;something",
-    "http://www.nutch.org/"
-  };
-
-  private static final DocumentFragment testDOMs[]=
-      new DocumentFragment[testPages.length];
-
-  private static URL[] testBaseHrefURLs= 
-      new URL[testPages.length];
-
-
-  private static final String[] answerText= {
-    "title body anchor",
-    "title body home bots",
-    "separate this from this",
-    "my title body home 1 2",
-    "my title",
-    "my title the bottom",
-    "my title Whitespace test whitespace test "
-        + "This is a whitespace test . Newlines should appear as space too. "
-        + "Tabs are spaces too. This is a break -> and the line after break . "
-        + "one two three space here space there no space "
-        + "one two two three three four put some text here and there. "
-        + "End this madness ! . . . .",
-        "ignore ignore",
-        "test1 test2",
-        "test1 test2",
-        "title anchor1 anchor2 anchor3",
-        "title anchor1 anchor2 anchor3 anchor4 anchor5",
-        "title"
-  };
-
-  private static final String[] answerTitle= {
-    "title",
-    "title",
-    "",
-    "my title",
-    "my title",
-    "my title",
-    "my title",
-    "",
-    "",
-    "",
-    "title",
-    "title",
-    "title"
-  };
+  private static String[] testBaseHrefs = { "http://www.nutch.org",
+      "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+      "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+      "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+      "http://www.nutch.org//", "http://www.nutch.org/",
+      "http://www.nutch.org/", "http://www.nutch.org/",
+      "http://www.nutch.org/;something", "http://www.nutch.org/" };
+
+  private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+  private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+  private static final String[] answerText = {
+      "title body anchor",
+      "title body home bots",
+      "separate this from this",
+      "my title body home 1 2",
+      "my title",
+      "my title the bottom",
+      "my title Whitespace test whitespace test "
+          + "This is a whitespace test . Newlines should appear as space too. "
+          + "Tabs are spaces too. This is a break -> and the line after break . "
+          + "one two three space here space there no space "
+          + "one two two three three four put some text here and there. "
+          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+      "test1 test2", "title anchor1 anchor2 anchor3",
+      "title anchor1 anchor2 anchor3 anchor4 anchor5", "title" };
+
+  private static final String[] answerTitle = { "title", "title", "",
+      "my title", "my title", "my title", "my title", "", "", "", "title",
+      "title", "title" };
 
   // note: should be in page-order
   private static Outlink[][] answerOutlinks;
@@ -230,87 +174,64 @@ public class TestDOMContentUtils {
     conf = NutchConfiguration.create();
     conf.setBoolean("parser.html.form.use_action", true);
     utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser= new DOMFragmentParser();
+    DOMFragmentParser parser = new DOMFragmentParser();
     try {
-      parser.setFeature(
-          "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
-          true);
-    } catch (SAXException e) {}
-    for (int i= 0; i < testPages.length; i++) {
-      DocumentFragment node= 
-          new HTMLDocumentImpl().createDocumentFragment();
+      parser
+          .setFeature(
+              "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+              true);
+    } catch (SAXException e) {
+    }
+    for (int i = 0; i < testPages.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
       try {
         parser.parse(
-            new InputSource( 
-                new ByteArrayInputStream(testPages[i].getBytes()) ),
-                node);
-        testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            node);
+        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
       } catch (Exception e) {
         Assert.assertTrue("caught exception: " + e, false);
-      } 
-      testDOMs[i]= node;
+      }
+      testDOMs[i] = node;
     }
     try {
-      answerOutlinks = new Outlink[][]{ 
-          {
-            new Outlink("http://www.nutch.org", "anchor"),
-          },
-          {
-            new Outlink("http://www.nutch.org/", "home"),
-            new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
-          },
-          {
-            new Outlink("http://www.nutch.org/", "separate this"),
-            new Outlink("http://www.nutch.org/docs/ok", "from this"),
-          },
-          {
-            new Outlink("http://www.nutch.org/", "home"),
-            new Outlink("http://www.nutch.org/docs/1", "1"),
-            new Outlink("http://www.nutch.org/docs/2", "2"),
-          },
-          {
-            new Outlink("http://www.nutch.org/frames/top.html", ""),
-            new Outlink("http://www.nutch.org/frames/left.html", ""),
-            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
-            new Outlink("http://www.nutch.org/frames/right.html", ""),
-          },
-          {
-            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
-            new Outlink("http://www.nutch.org/index.html", ""),
-            new Outlink("http://www.nutch.org/maps/#bottom", ""),
-            new Outlink("http://www.nutch.org/bot.html", ""),
-            new Outlink("http://www.nutch.org/docs/index.html", ""),
-          },
-          {
-            new Outlink("http://www.nutch.org/index.html", "whitespace test"),
-          },
-          {
-          },
-          {
-            new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
-          },
-          {
-          },
-          {
-            new Outlink("http://www.nutch.org/;x", "anchor1"),
-            new Outlink("http://www.nutch.org/g;x", "anchor2"),
-            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
-          },
-          {
-            // this is tricky - see RFC3986 section 5.4.1 example 7
-            new Outlink("http://www.nutch.org/g", "anchor1"),
-            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
-            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
-            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
-            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
-          },
-          {
-            new Outlink("http://www.nutch.org/g", ""),
-            new Outlink("http://www.nutch.org/g1", ""),
-            new Outlink("http://www.nutch.org/g2", "bla bla"),
-            new Outlink("http://www.nutch.org/test.gif", "bla bla"),
-          }
-      };
+      answerOutlinks = new Outlink[][] {
+          { new Outlink("http://www.nutch.org", "anchor"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+          { new Outlink("http://www.nutch.org/", "separate this"),
+              new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+          { new Outlink("http://www.nutch.org/", "home"),
+              new Outlink("http://www.nutch.org/docs/1", "1"),
+              new Outlink("http://www.nutch.org/docs/2", "2"), },
+          { new Outlink("http://www.nutch.org/frames/top.html", ""),
+              new Outlink("http://www.nutch.org/frames/left.html", ""),
+              new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+              new Outlink("http://www.nutch.org/frames/right.html", ""), },
+          { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+              new Outlink("http://www.nutch.org/index.html", ""),
+              new Outlink("http://www.nutch.org/maps/#bottom", ""),
+              new Outlink("http://www.nutch.org/bot.html", ""),
+              new Outlink("http://www.nutch.org/docs/index.html", ""), },
+          { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+          {},
+          { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+          {},
+          { new Outlink("http://www.nutch.org/;x", "anchor1"),
+              new Outlink("http://www.nutch.org/g;x", "anchor2"),
+              new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+          {
+              // this is tricky - see RFC3986 section 5.4.1 example 7
+              new Outlink("http://www.nutch.org/g", "anchor1"),
+              new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+              new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+              new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+              new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+                  "anchor5") },
+          { new Outlink("http://www.nutch.org/g", ""),
+              new Outlink("http://www.nutch.org/g1", ""),
+              new Outlink("http://www.nutch.org/g2", "bla bla"),
+              new Outlink("http://www.nutch.org/test.gif", "bla bla"), } };
 
     } catch (MalformedURLException e) {
 
@@ -318,58 +239,58 @@ public class TestDOMContentUtils {
   }
 
   private static boolean equalsIgnoreWhitespace(String s1, String s2) {
-    StringTokenizer st1= new StringTokenizer(s1);
-    StringTokenizer st2= new StringTokenizer(s2);
+    StringTokenizer st1 = new StringTokenizer(s1);
+    StringTokenizer st2 = new StringTokenizer(s2);
 
     while (st1.hasMoreTokens()) {
-      if (!st2.hasMoreTokens()) 
+      if (!st2.hasMoreTokens())
         return false;
-      if ( ! st1.nextToken().equals(st2.nextToken()) )
+      if (!st1.nextToken().equals(st2.nextToken()))
         return false;
     }
-    if (st2.hasMoreTokens()) 
+    if (st2.hasMoreTokens())
       return false;
     return true;
   }
 
   @Test
   public void testGetText() {
-    if (testDOMs[0] == null) 
+    if (testDOMs[0] == null)
       setup();
-    for (int i= 0; i < testPages.length; i++) {
-      StringBuffer sb= new StringBuffer();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
       utils.getText(sb, testDOMs[i]);
-      String text= sb.toString();
-      Assert.assertTrue("expecting text: " + answerText[i] 
-          + System.getProperty("line.separator") 
-          + System.getProperty("line.separator") 
-          + "got text: "+ text, 
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
           equalsIgnoreWhitespace(answerText[i], text));
     }
   }
 
   @Test
   public void testGetTitle() {
-    if (testDOMs[0] == null) 
+    if (testDOMs[0] == null)
       setup();
-    for (int i= 0; i < testPages.length; i++) {
-      StringBuffer sb= new StringBuffer();
+    for (int i = 0; i < testPages.length; i++) {
+      StringBuffer sb = new StringBuffer();
       utils.getTitle(sb, testDOMs[i]);
-      String text= sb.toString();
-      Assert.assertTrue("expecting text: " + answerText[i] 
-          + System.getProperty("line.separator") 
-          + System.getProperty("line.separator") 
-          + "got text: "+ text, 
+      String text = sb.toString();
+      Assert.assertTrue(
+          "expecting text: " + answerText[i]
+              + System.getProperty("line.separator")
+              + System.getProperty("line.separator") + "got text: " + text,
           equalsIgnoreWhitespace(answerTitle[i], text));
     }
   }
 
   @Test
   public void testGetOutlinks() {
-    if (testDOMs[0] == null) 
+    if (testDOMs[0] == null)
       setup();
-    for (int i= 0; i < testPages.length; i++) {
-      ArrayList<Outlink> outlinks= new ArrayList<Outlink>();
+    for (int i = 0; i < testPages.length; i++) {
+      ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
       if (i == SKIP) {
         conf.setBoolean("parser.html.form.use_action", false);
         utils.setConf(conf);
@@ -378,51 +299,47 @@ public class TestDOMContentUtils {
         utils.setConf(conf);
       }
       utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
-      Outlink[] outlinkArr= new Outlink[outlinks.size()];
-      outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
+      Outlink[] outlinkArr = new Outlink[outlinks.size()];
+      outlinkArr = (Outlink[]) outlinks.toArray(outlinkArr);
       compareOutlinks(answerOutlinks[i], outlinkArr);
     }
   }
 
   private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
-    for (int i= 0; i < o.length; i++) {
+    for (int i = 0; i < o.length; i++) {
       sb.append(o[i].toString());
       sb.append(System.getProperty("line.separator"));
     }
   }
 
   private static final String outlinksString(Outlink[] o) {
-    StringBuffer sb= new StringBuffer();
+    StringBuffer sb = new StringBuffer();
     appendOutlinks(sb, o);
     return sb.toString();
   }
 
   private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
     if (o1.length != o2.length) {
-      Assert.assertTrue("got wrong number of outlinks (expecting " + o1.length 
-          + ", got " + o2.length + ")" 
-          + System.getProperty("line.separator") 
-          + "answer: " + System.getProperty("line.separator") 
-          + outlinksString(o1) 
-          + System.getProperty("line.separator") 
-          + "got: " + System.getProperty("line.separator") 
-          + outlinksString(o2)
-          + System.getProperty("line.separator"),
-          false
-          );
+      Assert.assertTrue(
+          "got wrong number of outlinks (expecting " + o1.length + ", got "
+              + o2.length + ")" + System.getProperty("line.separator")
+              + "answer: " + System.getProperty("line.separator")
+              + outlinksString(o1) + System.getProperty("line.separator")
+              + "got: " + System.getProperty("line.separator")
+              + outlinksString(o2) + System.getProperty("line.separator"),
+          false);
     }
 
-    for (int i= 0; i < o1.length; i++) {
+    for (int i = 0; i < o1.length; i++) {
       if (!o1[i].equals(o2[i])) {
-        Assert.assertTrue("got wrong outlinks at position " + i
-            + System.getProperty("line.separator") 
-            + "answer: " + System.getProperty("line.separator") 
-            + "'" + o1[i].getToUrl() + "', anchor: '" + o1[i].getAnchor() + "'"
-            + System.getProperty("line.separator") 
-            + "got: " + System.getProperty("line.separator") 
-            + "'" + o2[i].getToUrl() + "', anchor: '" + o2[i].getAnchor() + "'",
-            false
-            );
+        Assert.assertTrue(
+            "got wrong outlinks at position " + i
+                + System.getProperty("line.separator") + "answer: "
+                + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+                + "', anchor: '" + o1[i].getAnchor() + "'"
+                + System.getProperty("line.separator") + "got: "
+                + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+                + "', anchor: '" + o2[i].getAnchor() + "'", false);
 
       }
     }

Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java Thu Jan 29 05:38:59 2015
@@ -33,69 +33,54 @@ import org.slf4j.LoggerFactory;
 
 public class TestHtmlParser {
 
-  public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TestHtmlParser.class);
 
-  private static final String encodingTestKeywords = 
-      "français, español, русский язык, čeština, ελληνικά";
-  private static final String encodingTestBody =
-      "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
-  private static final String encodingTestContent =
-      "<title>" + encodingTestKeywords + "</title>\n"
-          + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "</meta>\n"
-          + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
-
-  private static String[][] encodingTestPages= {
-    { 
-      "HTML4, utf-8, meta http-equiv, no quotes",
-      "utf-8",
-      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
-          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
-          + "<html>\n<head>\n"
-          + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
-          + encodingTestContent
-    },
-    { 
-      "HTML4, utf-8, meta http-equiv, single quotes",
-      "utf-8",
-      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
-          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
-          + "<html>\n<head>\n"
-          + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
-          + encodingTestContent
-    },
-    { 
-      "XHTML, utf-8, meta http-equiv, double quotes",
-      "utf-8",
-      "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
-          + "<html>\n<head>\n"
-          + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
-          + encodingTestContent
-    },
-    { 
-      "HTML5, utf-8, meta charset",
-      "utf-8",
-      "<!DOCTYPE html>\n<html>\n<head>\n"
-          + "<meta charset=\"utf-8\">"
-          + encodingTestContent
-    },
-    { 
-      "HTML5, utf-8, BOM",
-      "utf-8",
-      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
-          + encodingTestContent
-    },
-    { 
-      "HTML5, utf-16, BOM",
-      "utf-16",
-      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
-          + encodingTestContent
-    }
-  };
+  private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
+  private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
+  private static final String encodingTestContent = "<title>"
+      + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords
+      + "</meta>\n" + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages = {
+      {
+          "HTML4, utf-8, meta http-equiv, no quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML4, utf-8, meta http-equiv, single quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+              + encodingTestContent },
+      {
+          "XHTML, utf-8, meta http-equiv, double quotes",
+          "utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML5, utf-8, meta charset",
+          "utf-8",
+          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+              + encodingTestContent },
+      { "HTML5, utf-8, BOM", "utf-8",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+      { "HTML5, utf-16, BOM", "utf-16",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
 
   private Configuration conf;
   private Parser parser;
 
-  public TestHtmlParser() { 
+  public TestHtmlParser() {
     conf = NutchConfiguration.create();
     parser = new HtmlParser();
     parser.setConf(conf);
@@ -104,8 +89,8 @@ public class TestHtmlParser {
   protected Parse parse(byte[] contentBytes) {
     String dummyUrl = "http://dummy.url/";
     return parser.getParse(
-        new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(),
-            conf)).get(dummyUrl);
+        new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
+            new Metadata(), conf)).get(dummyUrl);
   }
 
   @Test

Modified: nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Thu Jan 29 05:38:59 2015
@@ -33,120 +33,96 @@ import org.apache.html.dom.*;
 public class TestRobotsMetaProcessor {
 
   /*
-
-  some sample tags:
-
-  <meta name="robots" content="index,follow">
-  <meta name="robots" content="noindex,follow">
-  <meta name="robots" content="index,nofollow">
-  <meta name="robots" content="noindex,nofollow">
-
-  <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
-
+   * 
+   * some sample tags:
+   * 
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   * 
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
    */
 
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
 
-  public static String[] tests= 
-    {
-    "<html><head><title>test page</title>"
-        + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
-        + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
-        + "</head><body>"
-        + " some text"
-        + "</body></html>",
-
-        "<html><head><title>test page</title>"
-            + "<meta name=\"robots\" content=\"all\"> "
-            + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
-            + "</head><body>"
-            + " some text"
-            + "</body></html>",
-
-            "<html><head><title>test page</title>"
-                + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
-                + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
-                + "</head><body>"
-                + " some text"
-                + "</body></html>",
-
-                "<html><head><title>test page</title>"
-                    + "<meta name=\"robots\" content=\"none\"> "
-                    + "</head><body>"
-                    + " some text"
-                    + "</body></html>",
-
-                    "<html><head><title>test page</title>"
-                        + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
-                        + "</head><body>"
-                        + " some text"
-                        + "</body></html>",
-
-                        "<html><head><title>test page</title>"
-                            + "<meta name=\"robots\" content=\"noindex,follow\"> "
-                            + "</head><body>"
-                            + " some text"
-                            + "</body></html>",
-
-                            "<html><head><title>test page</title>"
-                                + "<meta name=\"robots\" content=\"index,nofollow\"> "
-                                + "</head><body>"
-                                + " some text"
-                                + "</body></html>",
-
-                                "<html><head><title>test page</title>"
-                                    + "<meta name=\"robots\" content=\"index,follow\"> "
-                                    + "<base href=\"http://www.nutch.org/\">"
-                                    + "</head><body>"
-                                    + " some text"
-                                    + "</body></html>",
-
-                                    "<html><head><title>test page</title>"
-                                        + "<meta name=\"robots\"> "
-                                        + "<base href=\"http://www.nutch.org/base/\">"
-                                        + "</head><body>"
-                                        + " some text"
-                                        + "</body></html>",
-
-    };
-
-  public static final boolean[][] answers= {
-    {true, true, true},     // NONE
-    {false, false, true},   // all
-    {true, true, true},     // nOnE
-    {true, true, false},    // none
-    {true, true, false},    // noindex,nofollow
-    {true, false, false},   // noindex,follow
-    {false, true, false},   // index,nofollow
-    {false, false, false},  // index,follow
-    {false, false, false},  // missing!
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
   };
 
   private URL[][] currURLsAndAnswers;
 
   @Test
   public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser= new DOMFragmentParser();;
+    DOMFragmentParser parser = new DOMFragmentParser();
+    ;
 
-    try { 
-      currURLsAndAnswers= new URL[][] {
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org"), null},
-          {new URL("http://www.nutch.org/foo/"), 
-            new URL("http://www.nutch.org/")},
-            {new URL("http://www.nutch.org"), 
-              new URL("http://www.nutch.org/base/")}
-      };
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
     } catch (Exception e) {
       Assert.assertTrue("couldn't make test URLs!", false);
     }
 
-    for (int i= 0; i < tests.length; i++) {
-      byte[] bytes= tests[i].getBytes();
+    for (int i = 0; i < tests.length; i++) {
+      byte[] bytes = tests[i].getBytes();
 
       DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
 
@@ -156,9 +132,8 @@ public class TestRobotsMetaProcessor {
         e.printStackTrace();
       }
 
-      HTMLMetaTags robotsMeta= new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, 
-          currURLsAndAnswers[i][0]);
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
 
       Assert.assertTrue("got index wrong on test " + i,
           robotsMeta.getNoIndex() == answers[i][0]);
@@ -166,13 +141,13 @@ public class TestRobotsMetaProcessor {
           robotsMeta.getNoFollow() == answers[i][1]);
       Assert.assertTrue("got cache wrong on test " + i,
           robotsMeta.getNoCache() == answers[i][2]);
-      Assert.assertTrue("got base href wrong on test " + i + " (got "
-          + robotsMeta.getBaseHref() + ")",
-          ( (robotsMeta.getBaseHref() == null)
-              && (currURLsAndAnswers[i][1] == null) )
-              || ( (robotsMeta.getBaseHref() != null)
-                  && robotsMeta.getBaseHref().equals(
-                      currURLsAndAnswers[i][1]) ) );
+      Assert
+          .assertTrue(
+              "got base href wrong on test " + i + " (got "
+                  + robotsMeta.getBaseHref() + ")",
+              ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+                  || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                      .getBaseHref().equals(currURLsAndAnswers[i][1])));
 
     }
   }

Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Thu Jan 29 05:38:59 2015
@@ -1,19 +1,19 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one or more
-* contributor license agreements.  See the NOTICE file distributed with
-* this work for additional information regarding copyright ownership.
-* The ASF licenses this file to You under the Apache License, Version 2.0
-* (the "License"); you may not use this file except in compliance with
-* the License.  You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.nutch.parse.js;
 
 import java.io.BufferedReader;
@@ -56,9 +56,9 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
 /**
- * This class is a heuristic link extractor for JavaScript files and
- * code snippets. The general idea of a two-pass regex matching comes from
- * Heritrix. Parts of the code come from OutlinkExtractor.java
+ * This class is a heuristic link extractor for JavaScript files and code
+ * snippets. The general idea of a two-pass regex matching comes from Heritrix.
+ * Parts of the code come from OutlinkExtractor.java
  */
 public class JSParseFilter implements HtmlParseFilter, Parser {
   public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);
@@ -66,9 +66,9 @@ public class JSParseFilter implements Ht
   private static final int MAX_TITLE_LEN = 80;
 
   private Configuration conf;
-  
+
   public ParseResult filter(Content content, ParseResult parseResult,
-    HTMLMetaTags metaTags, DocumentFragment doc) {
+      HTMLMetaTags metaTags, DocumentFragment doc) {
 
     Parse parse = parseResult.get(content.getUrl());
 
@@ -82,37 +82,42 @@ public class JSParseFilter implements Ht
       outlinks.addAll(list);
       ParseStatus status = parse.getData().getStatus();
       String text = parse.getText();
-      Outlink[] newlinks = (Outlink[])outlinks.toArray(new Outlink[outlinks.size()]);
-      ParseData parseData = new ParseData(status, title, newlinks,
-                                          parse.getData().getContentMeta(),
-                                          parse.getData().getParseMeta());
+      Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks
+          .size()]);
+      ParseData parseData = new ParseData(status, title, newlinks, parse
+          .getData().getContentMeta(), parse.getData().getParseMeta());
 
       // replace original parse obj with new one
       parseResult.put(content.getUrl(), new ParseText(text), parseData);
     }
     return parseResult;
   }
-  
-  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
+
+  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
+      List<Outlink> outlinks) {
     if (n instanceof Element) {
       String name = n.getNodeName();
       if (name.equalsIgnoreCase("script")) {
- /*       String lang = null;
-        Node lNode = n.getAttributes().getNamedItem("language");
-        if (lNode == null) lang = "javascript";
-        else lang = lNode.getNodeValue(); */
+        /*
+         * String lang = null; Node lNode =
+         * n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
+         * "javascript"; else lang = lNode.getNodeValue();
+         */
         StringBuffer script = new StringBuffer();
         NodeList nn = n.getChildNodes();
         if (nn.getLength() > 0) {
           for (int i = 0; i < nn.getLength(); i++) {
-            if (i > 0) script.append('\n');
+            if (i > 0)
+              script.append('\n');
             script.append(nn.item(i).getNodeValue());
           }
           // if (LOG.isInfoEnabled()) {
-          //   LOG.info("script: language=" + lang + ", text: " + script.toString());
+          // LOG.info("script: language=" + lang + ", text: " +
+          // script.toString());
           // }
           Outlink[] links = getJSLinks(script.toString(), "", base);
-          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+          if (links != null && links.length > 0)
+            outlinks.addAll(Arrays.asList(links));
           // no other children of interest here, go one level up.
           return;
         }
@@ -124,7 +129,8 @@ public class JSParseFilter implements Ht
           // Window: onload,onunload
           // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
           // Keyboard: onkeydown,onkeypress,onkeyup
-          // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
+          // Mouse:
+          // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
           Node anode = attrs.item(i);
           Outlink[] links = null;
           if (anode.getNodeName().startsWith("on")) {
@@ -135,7 +141,8 @@ public class JSParseFilter implements Ht
               links = getJSLinks(val, "", base);
             }
           }
-          if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
+          if (links != null && links.length > 0)
+            outlinks.addAll(Arrays.asList(links));
         }
       }
     }
@@ -144,48 +151,56 @@ public class JSParseFilter implements Ht
       walk(nl.item(i), parse, metaTags, base, outlinks);
     }
   }
-  
+
   public ParseResult getParse(Content c) {
     String type = c.getContentType();
-    if (type != null && !type.trim().equals("") && !type.toLowerCase().startsWith("application/x-javascript"))
+    if (type != null && !type.trim().equals("")
+        && !type.toLowerCase().startsWith("application/x-javascript"))
       return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
-              "Content not JavaScript: '" + type + "'").getEmptyParseResult(c.getUrl(), getConf());
+          "Content not JavaScript: '" + type + "'").getEmptyParseResult(
+          c.getUrl(), getConf());
     String script = new String(c.getContent());
     Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
-    if (outlinks == null) outlinks = new Outlink[0];
+    if (outlinks == null)
+      outlinks = new Outlink[0];
     // Title? use the first line of the script...
     String title;
     int idx = script.indexOf('\n');
     if (idx != -1) {
-      if (idx > MAX_TITLE_LEN) idx = MAX_TITLE_LEN;
+      if (idx > MAX_TITLE_LEN)
+        idx = MAX_TITLE_LEN;
       title = script.substring(0, idx);
     } else {
       idx = Math.min(MAX_TITLE_LEN, script.length());
       title = script.substring(0, idx);
     }
     ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
-                                 c.getMetadata());
+        c.getMetadata());
     return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
   }
-  
+
   private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
   // A simple pattern. This allows also invalid URL characters.
   private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";
+
   // Alternative pattern, which limits valid url characters.
-  //private static final String URI_PATTERN = "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
-  
+  // private static final String URI_PATTERN =
+  // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";
+
   /**
-   *  This method extracts URLs from literals embedded in JavaScript.
+   * This method extracts URLs from literals embedded in JavaScript.
    */
   private Outlink[] getJSLinks(String plainText, String anchor, String base) {
 
     final List<Outlink> outlinks = new ArrayList<Outlink>();
     URL baseURL = null;
-    
+
     try {
       baseURL = new URL(base);
     } catch (Exception e) {
-      if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", e); }
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getJSLinks", e);
+      }
     }
 
     try {
@@ -194,8 +209,8 @@ public class JSParseFilter implements Ht
           Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
               | Perl5Compiler.MULTILINE_MASK);
       final Pattern pattern1 = cp.compile(URI_PATTERN,
-              Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-                  | Perl5Compiler.MULTILINE_MASK);
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
       final PatternMatcher matcher = new Perl5Matcher();
 
       final PatternMatcher matcher1 = new Perl5Matcher();
@@ -204,26 +219,27 @@ public class JSParseFilter implements Ht
       MatchResult result;
       String url;
 
-      //loop the matches
+      // loop the matches
       while (matcher.contains(input, pattern)) {
         result = matcher.getMatch();
         url = result.group(2);
         PatternMatcherInput input1 = new PatternMatcherInput(url);
         if (!matcher1.matches(input1, pattern1)) {
-          //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); }
+          // if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'");
+          // }
           continue;
         }
         if (url.startsWith("www.")) {
-            url = "http://" + url;
+          url = "http://" + url;
         } else {
-          // See if candidate URL is parseable.  If not, pass and move on to
+          // See if candidate URL is parseable. If not, pass and move on to
           // the next match.
           try {
             url = new URL(baseURL, url).toString();
           } catch (MalformedURLException ex) {
             if (LOG.isTraceEnabled()) {
-              LOG.trace(" - failed URL parse '" + url + "' and baseURL '" +
-                  baseURL + "'", ex);
+              LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
+                  + baseURL + "'", ex);
             }
             continue;
           }
@@ -237,12 +253,14 @@ public class JSParseFilter implements Ht
     } catch (Exception ex) {
       // if it is a malformed URL we just throw it away and continue with
       // extraction.
-      if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); }
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getJSLinks", ex);
+      }
     }
 
     final Outlink[] retval;
 
-    //create array of the Outlinks
+    // create array of the Outlinks
     if (outlinks != null && outlinks.size() > 0) {
       retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
     } else {
@@ -251,7 +269,7 @@ public class JSParseFilter implements Ht
 
     return retval;
   }
-  
+
   public static void main(String[] args) throws Exception {
     if (args.length < 2) {
       System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
@@ -261,10 +279,10 @@ public class JSParseFilter implements Ht
     BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
     StringBuffer sb = new StringBuffer();
     String line = null;
-    while ((line = br.readLine()) != null) 
+    while ((line = br.readLine()) != null)
       sb.append(line + "\n");
     br.close();
-    
+
     JSParseFilter parseFilter = new JSParseFilter();
     parseFilter.setConf(NutchConfiguration.create());
     Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);

Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * from JavaScript files and embedded JavaScript code snippets.
  */
 package org.apache.nutch.parse.js;
+

Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java Thu Jan 29 05:38:59 2015
@@ -21,3 +21,4 @@
  * (see {@link org.apache.nutch.indexer.metadata}).
  */
 package org.apache.nutch.parse.metatags;
+

Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original)
+++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Thu Jan 29 05:38:59 2015
@@ -44,11 +44,13 @@ import com.anotherbigidea.io.InStream;
  * distribution.
  */
 public class SWFParser implements Parser {
-  public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.swf");
+  public static final Logger LOG = LoggerFactory
+      .getLogger("org.apache.nutch.parse.swf");
 
   private Configuration conf = null;
 
-  public SWFParser() {}
+  public SWFParser() {
+  }
 
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -68,10 +70,12 @@ public class SWFParser implements Parser
       byte[] raw = content.getContent();
 
       String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
-      if (contentLength != null && raw.length != Integer.parseInt(contentLength)) {
-        return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
-                               "Content truncated at " + raw.length +
-                               " bytes. Parser can't handle incomplete files.").getEmptyParseResult(content.getUrl(), getConf());
+      if (contentLength != null
+          && raw.length != Integer.parseInt(contentLength)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+                + " bytes. Parser can't handle incomplete files.")
+            .getEmptyParseResult(content.getUrl(), getConf());
       }
       ExtractText extractor = new ExtractText();
 
@@ -87,7 +91,8 @@ public class SWFParser implements Parser
       reader.readFile();
       text = extractor.getText();
       String atext = extractor.getActionText();
-      if (atext != null && atext.length() > 0) text += "\n--------\n" + atext;
+      if (atext != null && atext.length() > 0)
+        text += "\n--------\n" + atext;
       // harvest potential outlinks
       String[] links = extractor.getUrls();
       for (int i = 0; i < links.length; i++) {
@@ -95,19 +100,25 @@ public class SWFParser implements Parser
         outlinks.add(out);
       }
       Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
-      if (olinks != null) for (int i = 0; i < olinks.length; i++) {
-        outlinks.add(olinks[i]);
-      }
+      if (olinks != null)
+        for (int i = 0; i < olinks.length; i++) {
+          outlinks.add(olinks[i]);
+        }
     } catch (Exception e) { // run time exception
       LOG.error("Error, runtime exception: ", e);
-      return new ParseStatus(ParseStatus.FAILED, "Can't be handled as SWF document. " + e).getEmptyParseResult(content.getUrl(), getConf());
-    } 
-    if (text == null) text = "";
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as SWF document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+    if (text == null)
+      text = "";
 
-    Outlink[] links = (Outlink[]) outlinks.toArray(new Outlink[outlinks.size()]);
+    Outlink[] links = (Outlink[]) outlinks
+        .toArray(new Outlink[outlinks.size()]);
     ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
-                                        content.getMetadata());
-    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
+        content.getMetadata());
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+        parseData));
   }
 
   /**
@@ -120,10 +131,9 @@ public class SWFParser implements Parser
     in.read(buf);
     in.close();
     SWFParser parser = new SWFParser();
-    ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],
-                                          buf, "application/x-shockwave-flash",
-                                          new Metadata(),
-                                          NutchConfiguration.create()));
+    ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
+        "file:" + args[0], buf, "application/x-shockwave-flash",
+        new Metadata(), NutchConfiguration.create()));
     Parse p = parseResult.get("file:" + args[0]);
     System.out.println("Parse Text:");
     System.out.println(p.getText());
@@ -168,7 +178,8 @@ class ExtractText extends SWFTagTypesImp
     StringBuffer res = new StringBuffer();
     Iterator<String> it = strings.iterator();
     while (it.hasNext()) {
-      if (res.length() > 0) res.append(' ');
+      if (res.length() > 0)
+        res.append(' ');
       res.append(it.next());
     }
     return res.toString();
@@ -176,10 +187,12 @@ class ExtractText extends SWFTagTypesImp
 
   public String getActionText() {
     StringBuffer res = new StringBuffer();
-    String[] strings = (String[])actionStrings.toArray(new String[actionStrings.size()]);
+    String[] strings = (String[]) actionStrings
+        .toArray(new String[actionStrings.size()]);
     Arrays.sort(strings);
     for (int i = 0; i < strings.length; i++) {
-      if (i > 0) res.append('\n');
+      if (i > 0)
+        res.append('\n');
       res.append(strings[i]);
     }
     return res.toString();
@@ -196,14 +209,16 @@ class ExtractText extends SWFTagTypesImp
     return res;
   }
 
-  public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3, int arg4) throws IOException {
+  public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
+      int arg4) throws IOException {
     tagDefineFontInfo(arg0, arg1, arg2, arg3);
   }
 
   /**
    * SWFTagTypes interface Save the Text Font character code info
    */
-  public void tagDefineFontInfo(int fontId, String fontName, int flags, int[] codes) throws IOException {
+  public void tagDefineFontInfo(int fontId, String fontName, int flags,
+      int[] codes) throws IOException {
     // System.out.println("-defineFontInfo id=" + fontId + ", name=" +
     // fontName);
     fontCodes.put(new Integer(fontId), codes);
@@ -213,16 +228,16 @@ class ExtractText extends SWFTagTypesImp
   // XXX codes anyway, so we just give up.
   /*
    * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
-   *    return null;
-   * }
+   * return null; }
    */
 
   /**
    * SWFTagTypes interface. Save the character code info.
    */
-  public SWFVectors tagDefineFont2(int id, int flags, String name, int numGlyphs, int ascent, int descent, int leading,
-          int[] codes, int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2, int[] kernAdjustments)
-          throws IOException {
+  public SWFVectors tagDefineFont2(int id, int flags, String name,
+      int numGlyphs, int ascent, int descent, int leading, int[] codes,
+      int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
+      int[] kernAdjustments) throws IOException {
     // System.out.println("-defineFontInfo id=" + id + ", name=" + name);
     fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]);
 
@@ -232,9 +247,10 @@ class ExtractText extends SWFTagTypesImp
   /**
    * SWFTagTypes interface. Dump any initial text in the field.
    */
-  public void tagDefineTextField(int fieldId, String fieldName, String initialText, Rect boundary, int flags,
-          AlphaColor textColor, int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
-          int rightMargin, int indentation, int lineSpacing) throws IOException {
+  public void tagDefineTextField(int fieldId, String fieldName,
+      String initialText, Rect boundary, int flags, AlphaColor textColor,
+      int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
+      int rightMargin, int indentation, int lineSpacing) throws IOException {
     if (initialText != null) {
       strings.add(initialText);
     }
@@ -243,7 +259,8 @@ class ExtractText extends SWFTagTypesImp
   /**
    * SWFTagTypes interface
    */
-  public SWFText tagDefineText(int id, Rect bounds, Matrix matrix) throws IOException {
+  public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
+      throws IOException {
     lastBounds = curBounds;
     curBounds = bounds;
     return new TextDumper();
@@ -255,7 +272,8 @@ class ExtractText extends SWFTagTypesImp
   /**
    * SWFTagTypes interface
    */
-  public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix) throws IOException {
+  public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
+      throws IOException {
     lastBounds = curBounds;
     curBounds = bounds;
     return new TextDumper();
@@ -273,15 +291,16 @@ class ExtractText extends SWFTagTypesImp
     public void setY(int y) {
       if (firstY)
         firstY = false;
-      else strings.add("\n"); // Change in Y - dump a new line
+      else
+        strings.add("\n"); // Change in Y - dump a new line
     }
 
     /*
      * There are some issues with this method: sometimes SWF files define their
-     * own font, so short of OCR we cannot guess what is the glyph code -> character
-     * mapping. Additionally, some files don't use literal space character, instead
-     * they adjust glyphAdvances. We don't handle it at all - in such cases the text
-     * will be all glued together.
+     * own font, so short of OCR we cannot guess what is the glyph code ->
+     * character mapping. Additionally, some files don't use literal space
+     * character, instead they adjust glyphAdvances. We don't handle it at all -
+     * in such cases the text will be all glued together.
      */
     public void text(int[] glyphIndices, int[] glyphAdvances) {
       // System.out.println("-text id=" + fontId);
@@ -310,9 +329,11 @@ class ExtractText extends SWFTagTypesImp
       strings.add(new String(chars));
     }
 
-    public void color(Color color) {}
+    public void color(Color color) {
+    }
 
-    public void setX(int x) {}
+    public void setX(int x) {
+    }
 
     public void done() {
       strings.add("\n");
@@ -367,7 +388,8 @@ class NutchSWFActions extends SWFActionB
 
   public void lookupTable(String[] values) throws IOException {
     for (int i = 0; i < values.length; i++) {
-      if (!strings.contains(values[i])) strings.add(values[i]);
+      if (!strings.contains(values[i]))
+        strings.add(values[i]);
     }
     super.lookupTable(values);
     dict = values;
@@ -379,7 +401,7 @@ class NutchSWFActions extends SWFActionB
   }
 
   public void getURL(int vars, int mode) {
-  // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
+    // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
   }
 
   public void getURL(String url, String target) throws IOException {
@@ -444,7 +466,8 @@ class NutchSWFActions extends SWFActionB
     super.setTarget(var);
   }
 
-  public SWFActionBlock startFunction(String var, String[] params) throws IOException {
+  public SWFActionBlock startFunction(String var, String[] params)
+      throws IOException {
     stack.push(var);
     strings.remove(var);
     if (params != null) {
@@ -455,7 +478,8 @@ class NutchSWFActions extends SWFActionB
     return this;
   }
 
-  public SWFActionBlock startFunction2(String var, int arg1, int arg2, String[] params, int[] arg3) throws IOException {
+  public SWFActionBlock startFunction2(String var, int arg1, int arg2,
+      String[] params, int[] arg3) throws IOException {
     stack.push(var);
     strings.remove(var);
     if (params != null) {
@@ -655,6 +679,7 @@ class SmallStack extends Stack<Object> {
     // tolerate underruns
     if (this.size() == 0)
       return null;
-    else return super.pop();
+    else
+      return super.pop();
   }
 }

Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java (original)
+++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Parse Flash SWF files.
  */
 package org.apache.nutch.parse.swf;
+

Modified: nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (original)
+++ nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Thu Jan 29 05:38:59 2015
@@ -34,17 +34,19 @@ import org.apache.nutch.util.NutchConfig
 import org.junit.Assert;
 import org.junit.Test;
 
-/** 
+/**
  * Unit tests for SWFParser.
  */
 public class TestSWFParser {
 
   private String fileSeparator = System.getProperty("file.separator");
   // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data",".");
+  private String sampleDir = System.getProperty("test.data", ".");
 
-  private String[] sampleFiles = new String[]{"test1.swf", "test2.swf", "test3.swf"};
-  private String[] sampleTexts = new String[]{"test1.txt", "test2.txt", "test3.txt"};
+  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
+      "test3.swf" };
+  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
+      "test3.txt" };
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
@@ -58,7 +60,8 @@ public class TestSWFParser {
       urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
 
       protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
 
       parse = new ParseUtil(conf).parse(content).get(content.getUrl());
 
@@ -67,11 +70,12 @@ public class TestSWFParser {
     }
   }
 
-  public TestSWFParser() { 
+  public TestSWFParser() {
     for (int i = 0; i < sampleFiles.length; i++) {
       try {
         // read the test string
-        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleTexts[i]);
+        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+            + sampleTexts[i]);
         StringBuffer sb = new StringBuffer();
         int len = 0;
         InputStreamReader isr = new InputStreamReader(fis, "UTF-8");