You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/08/30 16:54:43 UTC

[2/3] nutch git commit: NUTCH-2264 Forbidden APIs are Checked at Build

NUTCH-2264 Forbidden APIs are Checked at Build


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/a671540a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/a671540a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/a671540a

Branch: refs/heads/2.x
Commit: a671540a94d8afafd72a09396c97d9ede43a7ea2
Parents: 9ecdc9b
Author: Furkan KAMACI <fu...@gmail.com>
Authored: Mon Aug 29 18:24:50 2016 +0300
Committer: Furkan KAMACI <fu...@gmail.com>
Committed: Mon Aug 29 18:44:25 2016 +0300

----------------------------------------------------------------------
 build.xml                                       | 29 ++++++++++++++++++++
 ivy/ivy.xml                                     |  2 ++
 .../org/apache/nutch/api/impl/JobWorker.java    |  9 +++---
 .../nutch/api/resources/AdminResource.java      |  3 +-
 .../nutch/api/resources/SeedResource.java       |  9 +++---
 .../org/apache/nutch/crawl/DbUpdaterJob.java    |  3 +-
 .../org/apache/nutch/crawl/GeneratorJob.java    |  3 +-
 .../org/apache/nutch/crawl/InjectorJob.java     |  5 ++--
 .../org/apache/nutch/fetcher/FetcherJob.java    |  3 +-
 .../apache/nutch/fetcher/FetcherReducer.java    |  8 +++---
 .../apache/nutch/host/HostDbUpdateReducer.java  |  5 ++--
 .../org/apache/nutch/host/HostInjectorJob.java  |  3 +-
 .../org/apache/nutch/net/URLFilterChecker.java  |  5 ++--
 .../apache/nutch/net/URLNormalizerChecker.java  |  5 ++--
 src/java/org/apache/nutch/parse/ParseUtil.java  |  8 ++++--
 src/java/org/apache/nutch/parse/ParserJob.java  |  3 +-
 src/java/org/apache/nutch/protocol/Content.java |  3 +-
 .../apache/nutch/protocol/RobotRulesParser.java |  6 ++--
 src/java/org/apache/nutch/tools/Benchmark.java  |  3 +-
 src/java/org/apache/nutch/tools/DmozParser.java |  5 ++--
 .../org/apache/nutch/tools/ResolveUrls.java     |  8 +++---
 .../apache/nutch/tools/arc/ArcRecordReader.java |  3 +-
 .../apache/nutch/tools/proxy/FakeHandler.java   | 11 ++++----
 src/java/org/apache/nutch/util/Bytes.java       |  3 +-
 .../org/apache/nutch/util/EncodingDetector.java |  7 +++--
 src/java/org/apache/nutch/util/TimingUtil.java  |  3 +-
 src/java/org/apache/nutch/util/URLUtil.java     |  5 ++--
 .../nutch/util/domain/DomainStatistics.java     |  3 +-
 .../nutch/webui/client/impl/RemoteCommand.java  |  6 ++--
 .../creativecommons/nutch/CCParseFilter.java    |  7 +++--
 .../indexer/anchor/AnchorIndexingFilter.java    |  3 +-
 .../nutch/indexer/html/HtmlIndexingFilter.java  |  3 +-
 .../indexer/more/TestMoreIndexingFilter.java    |  3 +-
 .../nutch/analysis/lang/HTMLLanguageParser.java |  9 +++---
 .../analysis/lang/TestHTMLLanguageParser.java   |  5 ++--
 .../nutch/protocol/http/api/HttpBase.java       |  3 +-
 .../protocol/http/api/HttpRobotRulesParser.java |  5 ++--
 .../protocol/http/api/TestRobotRulesParser.java | 10 ++++---
 .../nutch/urlfilter/api/RegexURLFilterBase.java |  6 ++--
 .../urlfilter/api/RegexURLFilterBaseTest.java   | 12 +++++---
 .../nutch/microformats/reltag/RelTagParser.java |  3 +-
 .../microformats/reltag/TestRelTagParser.java   |  3 +-
 .../nutch/parse/html/DOMContentUtils.java       |  3 +-
 .../nutch/parse/html/HTMLMetaProcessor.java     | 13 +++++----
 .../nutch/parse/html/TestDOMContentUtils.java   |  3 +-
 .../parse/html/TestRobotsMetaProcessor.java     |  3 +-
 .../apache/nutch/parse/js/JSParseFilter.java    |  5 ++--
 .../nutch/parse/metatags/MetaTagsParser.java    |  3 +-
 .../parse/metatags/TestMetaTagsParser.java      |  7 +++--
 .../nutch/parse/tika/DOMContentUtils.java       |  4 ++-
 .../nutch/parse/tika/HTMLMetaProcessor.java     | 13 +++++----
 .../nutch/parse/tika/DOMContentUtilsTest.java   |  3 +-
 .../nutch/parse/tika/TestImageMetadata.java     |  5 ++--
 .../org/apache/nutch/protocol/file/File.java    |  3 +-
 .../nutch/protocol/file/FileResponse.java       |  3 +-
 .../org/apache/nutch/protocol/ftp/Client.java   |  3 +-
 .../java/org/apache/nutch/protocol/ftp/Ftp.java |  3 +-
 .../apache/nutch/protocol/ftp/FtpResponse.java  |  5 ++--
 .../nutch/protocol/ftp/FtpRobotRulesParser.java |  5 ++--
 .../nutch/protocol/http/HttpResponse.java       |  5 ++--
 .../httpclient/HttpBasicAuthentication.java     |  5 ++--
 .../org/apache/nutch/protocol/sftp/Sftp.java    |  3 +-
 .../scoring/opic/TestOPICScoringFilter.java     |  8 +++++-
 .../nutch/collection/TestSubcollection.java     |  3 +-
 .../nutch/urlfilter/domain/DomainURLFilter.java |  8 ++++--
 .../nutch/urlfilter/prefix/PrefixURLFilter.java |  3 +-
 .../nutch/urlfilter/suffix/SuffixURLFilter.java | 15 ++++++----
 .../urlnormalizer/basic/BasicURLNormalizer.java | 12 +++-----
 .../urlnormalizer/regex/RegexURLNormalizer.java |  7 +++--
 .../regex/TestRegexURLNormalizer.java           |  5 ++--
 .../apache/nutch/parse/TestSitemapParser.java   | 20 ++------------
 .../apache/nutch/plugin/TestPluginSystem.java   | 12 ++++----
 .../apache/nutch/util/TestEncodingDetector.java | 12 ++++----
 .../org/apache/nutch/util/TestGZIPUtils.java    | 25 +++++++++--------
 .../org/apache/nutch/util/TestNodeWalker.java   |  3 +-
 75 files changed, 286 insertions(+), 185 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 061d0b7..f051838 100644
--- a/build.xml
+++ b/build.xml
@@ -63,6 +63,35 @@
  </path>
 
  <!-- ====================================================== -->
+ <!-- Forbidden APIs Targets -->
+ <!-- ====================================================== -->
+
+ <!-- will be used by forbiddenapis -->
+ <path id="all-lib-classpath">
+   <fileset dir="${build.lib.dir}">
+     <include name="**/*.jar" />
+   </fileset>
+   <fileset dir="${runtime.dir}/local/lib">
+     <include name="**/*.jar" />
+   </fileset>
+   <fileset dir="${runtime.dir}/local/plugins">
+     <include name="**/*.jar" />
+   </fileset>
+ </path>
+
+ <!-- forbiddenapis target -->
+ <target name="precommit" depends="runtime, test">
+   <!-- forbiddenapis task definition -->
+   <taskdef name="forbiddenapis" classname="de.thetaphi.forbiddenapis.ant.AntTask" classpath="${build.lib.dir}/forbiddenapis-2.2.jar"/>
+
+   <forbiddenapis classpathref="all-lib-classpath" dir="${build.dir}" targetVersion="${javac.version}">
+     <bundledsignatures name="jdk-unsafe"/>
+     <bundledsignatures name="jdk-deprecated"/>
+     <bundledsignatures name="jdk-non-portable"/>
+   </forbiddenapis>
+ </target>
+
+ <!-- ====================================================== -->
  <!-- Stuff needed by all targets -->
  <!-- ====================================================== -->
  <target name="init" depends="ivy-init" description="--> stuff required by all targets">

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/ivy/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index db42162..e173e71 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -81,6 +81,8 @@
     <dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3" conf="*->default" />
     <dependency org="org.restlet.jee" name="org.restlet.ext.crypto" rev="2.2.3" conf="*->default" />
 
+    <dependency org="de.thetaphi" name="forbiddenapis" rev="2.2" conf="*->default"/>
+
     <!--artifacts needed for testing -->
     <dependency org="junit" name="junit" rev="4.11" conf="*->default" />
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/impl/JobWorker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/api/impl/JobWorker.java b/src/java/org/apache/nutch/api/impl/JobWorker.java
index 9c7c5c2..8ac78cc 100644
--- a/src/java/org/apache/nutch/api/impl/JobWorker.java
+++ b/src/java/org/apache/nutch/api/impl/JobWorker.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.api.impl;
 
 import java.text.MessageFormat;
+import java.util.Locale;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.api.model.request.JobConfig;
@@ -49,11 +50,11 @@ public class JobWorker implements Runnable {
 
   private String generateId() {
     if (jobConfig.getCrawlId() == null) {
-      return MessageFormat.format("{0}-{1}-{2}", jobConfig.getConfId(),
-          jobConfig.getType(), String.valueOf(hashCode()));
+      return new MessageFormat("{0}-{1}-{2}", Locale.ROOT)
+          .format(new Object[] {jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())});
     }
-    return MessageFormat.format("{0}-{1}-{2}-{3}", jobConfig.getCrawlId(),
-        jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode()));
+    return new MessageFormat("{0}-{1}-{2}-{3}", Locale.ROOT)
+        .format(new Object[] {jobConfig.getCrawlId(), jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())});
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/resources/AdminResource.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/api/resources/AdminResource.java b/src/java/org/apache/nutch/api/resources/AdminResource.java
index cfbf8d5..6e93f11 100644
--- a/src/java/org/apache/nutch/api/resources/AdminResource.java
+++ b/src/java/org/apache/nutch/api/resources/AdminResource.java
@@ -18,6 +18,7 @@ package org.apache.nutch.api.resources;
 
 import java.text.MessageFormat;
 import java.util.Date;
+import java.util.Locale;
 import java.util.concurrent.TimeUnit;
 
 import javax.ws.rs.GET;
@@ -72,7 +73,7 @@ public class AdminResource extends AbstractResource {
     }
 
     scheduleServerStop();
-    return MessageFormat.format("Stopping in {0} seconds.", DELAY_SEC);
+    return new MessageFormat("Stopping in {0} seconds.", Locale.ROOT).format(new Object[] {DELAY_SEC}); // Format.format(Object) requires the arguments as an Object[]
   }
 
   private void scheduleServerStop() {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/resources/SeedResource.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/api/resources/SeedResource.java b/src/java/org/apache/nutch/api/resources/SeedResource.java
index d7439e0..472c842 100644
--- a/src/java/org/apache/nutch/api/resources/SeedResource.java
+++ b/src/java/org/apache/nutch/api/resources/SeedResource.java
@@ -20,9 +20,10 @@ import static javax.ws.rs.core.Response.status;
 
 import java.io.BufferedWriter;
 import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
+import java.io.OutputStreamWriter;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 
 import javax.ws.rs.Consumes;
@@ -90,9 +91,7 @@ public class SeedResource extends AbstractResource {
 
   private BufferedWriter getWriter(File seedFile) {
     try {
-      return new BufferedWriter(new FileWriter(seedFile));
-    } catch (FileNotFoundException e) {
-      throw handleException(e);
+      return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seedFile), StandardCharsets.UTF_8));
     } catch (IOException e) {
       throw handleException(e);
     }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java b/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
index 4b1618c..3885b68 100644
--- a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
+++ b/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
@@ -19,6 +19,7 @@ package org.apache.nutch.crawl;
 import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.HashSet;
+import java.util.Locale;
 import java.util.Map;
 
 import org.apache.avro.util.Utf8;
@@ -129,7 +130,7 @@ public class DbUpdaterJob extends NutchTool implements Tool {
 
   private int updateTable(String crawlId, String batchId) throws Exception {
 
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
     long start = System.currentTimeMillis();
     LOG.info("DbUpdaterJob: starting at " + sdf.format(start));
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/GeneratorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index e06a192..f47637f 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java
@@ -25,6 +25,7 @@ import java.util.Map;
 import java.util.Random;
 import java.util.Set;
 import java.util.Collection;
+import java.util.Locale;
 
 import org.apache.hadoop.mapreduce.Job;
 import org.slf4j.Logger;
@@ -255,7 +256,7 @@ public class GeneratorJob extends NutchTool implements Tool {
   public String generate(long topN, long curTime, boolean filter, boolean norm,
       boolean sitemap) throws Exception {
 
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
     long start = System.currentTimeMillis();
     LOG.info("GeneratorJob: starting at {}", sdf.format(start));
     LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/InjectorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/InjectorJob.java b/src/java/org/apache/nutch/crawl/InjectorJob.java
index 5094b0f..df91a73 100644
--- a/src/java/org/apache/nutch/crawl/InjectorJob.java
+++ b/src/java/org/apache/nutch/crawl/InjectorJob.java
@@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
 import java.util.*;
 
@@ -181,7 +182,7 @@ public class InjectorJob extends NutchTool implements Tool {
           String keymd = keysIter.next();
           String valuemd = metadata.get(keymd);
           row.getMetadata().put(new Utf8(keymd),
-              ByteBuffer.wrap(valuemd.getBytes()));
+              ByteBuffer.wrap(valuemd.getBytes(StandardCharsets.UTF_8)));
         }
 
         if (customScore != -1)
@@ -260,7 +261,7 @@ public class InjectorJob extends NutchTool implements Tool {
   }
 
   public void inject(Path urlDir) throws Exception {
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
     long start = System.currentTimeMillis();
     LOG.info("InjectorJob: starting at " + sdf.format(start));
     LOG.info("InjectorJob: Injecting urlDir: " + urlDir);

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/fetcher/FetcherJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java
index 268d9f6..a7f3df8 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherJob.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java
@@ -22,6 +22,7 @@ import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Random;
+import java.util.Locale;
 import org.apache.avro.util.Utf8;
 import org.apache.gora.filter.FilterOp;
 import org.apache.gora.filter.MapFieldValueFilter;
@@ -278,7 +279,7 @@ public class FetcherJob extends NutchTool implements Tool {
   public int fetch(String batchId, int threads, boolean shouldResume,
       int numTasks, boolean stmDetect, boolean sitemap) throws Exception {
 
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
     long start = System.currentTimeMillis();
     LOG.info("FetcherJob: starting at " + sdf.format(start));
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
index 8ee7477..68b982d 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
@@ -106,7 +106,7 @@ public class FetcherReducer extends
         LOG.warn("Cannot parse url: " + url, e);
         return null;
       }
-      final String proto = u.getProtocol().toLowerCase();
+      final String proto = u.getProtocol().toLowerCase(Locale.ROOT);
       String host;
       if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
         try {
@@ -131,7 +131,7 @@ public class FetcherReducer extends
           host = u.toExternalForm();
         }
       }
-      queueID = proto + "://" + host.toLowerCase();
+      queueID = proto + "://" + host.toLowerCase(Locale.ROOT);
       return new FetchItem(url, page, u, queueID);
     }
 
@@ -639,8 +639,8 @@ public class FetcherReducer extends
       }
 
       if (ignoreExternalLinks) {
-        String toHost = new URL(newUrl).getHost().toLowerCase();
-        String fromHost = new URL(url).getHost().toLowerCase();
+        String toHost = new URL(newUrl).getHost().toLowerCase(Locale.ROOT);
+        String fromHost = new URL(url).getHost().toLowerCase(Locale.ROOT);
         if (toHost == null || !toHost.equals(fromHost)) {
           // external links
           return;

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java b/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
index 933f546..3043543 100644
--- a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
+++ b/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
@@ -27,6 +27,7 @@ import org.apache.nutch.util.URLUtil;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.Set;
 
 /**
@@ -78,10 +79,10 @@ public class HostDbUpdateReducer extends
     // output host data
     Host host = new Host();
     host.getMetadata().put(new Utf8("p"),
-        ByteBuffer.wrap(Integer.toString(numPages).getBytes()));
+        ByteBuffer.wrap(Integer.toString(numPages).getBytes(StandardCharsets.UTF_8)));
     if (numFetched > 0) {
       host.getMetadata().put(new Utf8("f"),
-          ByteBuffer.wrap(Integer.toString(numFetched).getBytes()));
+          ByteBuffer.wrap(Integer.toString(numFetched).getBytes(StandardCharsets.UTF_8)));
     }
     for (String inlink : inlinkCount.getKeys()) {
       host.getInlinks().put(new Utf8(inlink),

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/host/HostInjectorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/host/HostInjectorJob.java b/src/java/org/apache/nutch/host/HostInjectorJob.java
index 12cdf28..83f247c 100644
--- a/src/java/org/apache/nutch/host/HostInjectorJob.java
+++ b/src/java/org/apache/nutch/host/HostInjectorJob.java
@@ -40,6 +40,7 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.net.URL;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 
 /**
@@ -124,7 +125,7 @@ public class HostInjectorJob implements Tool {
         String keymd = keysIter.next();
         String valuemd = metadata.get(keymd);
         host.getMetadata().put(new Utf8(keymd),
-            ByteBuffer.wrap(valuemd.getBytes()));
+            ByteBuffer.wrap(valuemd.getBytes(StandardCharsets.UTF_8)));
       }
       String hostname;
       if (url.indexOf("://") > -1) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/net/URLFilterChecker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java
index 21ffd03..ee4daf7 100644
--- a/src/java/org/apache/nutch/net/URLFilterChecker.java
+++ b/src/java/org/apache/nutch/net/URLFilterChecker.java
@@ -27,6 +27,7 @@ import org.apache.nutch.util.NutchConfiguration;
 
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 
 /**
  * Checks one given filter or all filters.
@@ -71,7 +72,7 @@ public class URLFilterChecker {
 
     System.out.println("Checking URLFilter " + filterName);
 
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
     String line;
     while ((line = in.readLine()) != null) {
       String out = filter.filter(line);
@@ -88,7 +89,7 @@ public class URLFilterChecker {
   private void checkAll() throws Exception {
     System.out.println("Checking combination of all URLFilters available");
 
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
     String line;
     while ((line = in.readLine()) != null) {
       URLFilters filters = new URLFilters(this.conf);

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
index d8f1c6e..b1ec60f 100644
--- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
+++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -27,6 +27,7 @@ import org.apache.nutch.util.NutchConfiguration;
 
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 
 /**
  * Checks one given normalizer or all normalizers.
@@ -66,7 +67,7 @@ public class URLNormalizerChecker {
 
     System.out.println("Checking URLNormalizer " + normalizerName);
 
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
     String line;
     while ((line = in.readLine()) != null) {
       String out = normalizer.normalize(line, scope);
@@ -77,7 +78,7 @@ public class URLNormalizerChecker {
   private void checkAll(String scope) throws Exception {
     System.out.println("Checking combination of all URLNormalizers available");
 
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
     String line;
     URLNormalizers normalizers = new URLNormalizers(conf, scope);
     while ((line = in.readLine()) != null) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/parse/ParseUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParseUtil.java b/src/java/org/apache/nutch/parse/ParseUtil.java
index 8a37063..a38fb0a 100644
--- a/src/java/org/apache/nutch/parse/ParseUtil.java
+++ b/src/java/org/apache/nutch/parse/ParseUtil.java
@@ -45,6 +45,8 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
@@ -243,7 +245,7 @@ public class ParseUtil extends Configured {
           for (Map.Entry<String, String[]> metadata : metaDatas) {
             System.out.println();
             newRow.getMetadata().put(new Utf8(metadata.getKey()),
-                ByteBuffer.wrap(metadata.getValue()[0].getBytes()));
+                ByteBuffer.wrap(metadata.getValue()[0].getBytes(StandardCharsets.UTF_8)));
           }
 
           int changeFrequency = calculateFetchInterval(
@@ -362,7 +364,7 @@ public class ParseUtil extends Configured {
         String fromHost;
         if (ignoreExternalLinks) {
           try {
-            fromHost = new URL(url).getHost().toLowerCase();
+            fromHost = new URL(url).getHost().toLowerCase(Locale.ROOT);
           } catch (final MalformedURLException e) {
             fromHost = null;
           }
@@ -382,7 +384,7 @@ public class ParseUtil extends Configured {
           String toHost;
           if (ignoreExternalLinks) {
             try {
-              toHost = new URL(toUrl).getHost().toLowerCase();
+              toHost = new URL(toUrl).getHost().toLowerCase(Locale.ROOT);
             } catch (final MalformedURLException e) {
               toHost = null;
             }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/parse/ParserJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParserJob.java b/src/java/org/apache/nutch/parse/ParserJob.java
index a021879..9762a00 100644
--- a/src/java/org/apache/nutch/parse/ParserJob.java
+++ b/src/java/org/apache/nutch/parse/ParserJob.java
@@ -21,6 +21,7 @@ import java.nio.ByteBuffer;
 import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.HashSet;
+import java.util.Locale;
 import java.util.Map;
 
 import org.apache.avro.util.Utf8;
@@ -304,7 +305,7 @@ public class ParserJob extends NutchTool implements Tool {
       boolean sitemap)
           throws Exception {
 
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
     long start = System.currentTimeMillis();
     LOG.info("ParserJob: starting at {}", sdf.format(start));
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/protocol/Content.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/Content.java b/src/java/org/apache/nutch/protocol/Content.java
index f4c4098..77f9b51 100755
--- a/src/java/org/apache/nutch/protocol/Content.java
+++ b/src/java/org/apache/nutch/protocol/Content.java
@@ -23,6 +23,7 @@ import java.io.DataInput;
 import java.io.DataInputStream;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.zip.InflaterInputStream;
 
@@ -265,7 +266,7 @@ public final class Content implements Writable {
     buffer.append("contentType: " + contentType + "\n");
     buffer.append("metadata: " + metadata + "\n");
     buffer.append("Content:\n");
-    buffer.append(new String(content)); // try default encoding
+    buffer.append(new String(content, StandardCharsets.UTF_8)); // decode as UTF-8 rather than the platform default
 
     return buffer.toString();
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 16e380d..867b71b 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -19,9 +19,11 @@ package org.apache.nutch.protocol;
 
 // JDK imports
 import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Hashtable;
 import java.util.StringTokenizer;
 
@@ -172,7 +174,7 @@ public abstract class RobotRulesParser implements Configurable {
       BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
           "text/plain", argv[2]);
 
-      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+      LineNumberReader testsIn = new LineNumberReader(new InputStreamReader(new FileInputStream(argv[1]), StandardCharsets.UTF_8));
       String testPath = testsIn.readLine().trim();
       while (testPath != null) {
         System.out.println((rules.isAllowed(testPath) ? "allowed"

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/Benchmark.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/Benchmark.java b/src/java/org/apache/nutch/tools/Benchmark.java
index 6643ba3..68c1755 100644
--- a/src/java/org/apache/nutch/tools/Benchmark.java
+++ b/src/java/org/apache/nutch/tools/Benchmark.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.tools;
 
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -55,7 +56,7 @@ public class Benchmark extends Configured implements Tool {
     OutputStream os = fs.create(new Path(seedsDir, "seeds"));
     for (int i = 0; i < count; i++) {
       String url = "http://www.test-" + i + ".com/\r\n";
-      os.write(url.getBytes());
+      os.write(url.getBytes(StandardCharsets.UTF_8));
     }
     os.flush();
     os.close();

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/DmozParser.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/DmozParser.java b/src/java/org/apache/nutch/tools/DmozParser.java
index ae63505..03d2662 100644
--- a/src/java/org/apache/nutch/tools/DmozParser.java
+++ b/src/java/org/apache/nutch/tools/DmozParser.java
@@ -19,6 +19,7 @@ package org.apache.nutch.tools;
 
 import java.io.*;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.regex.*;
 
@@ -195,12 +196,12 @@ public class DmozParser {
               if (row != null) {
                 if (desc.length() > 0) {
                   row.getMetadata().put(new Utf8("_dmoz_desc_"),
-                      ByteBuffer.wrap(desc.toString().getBytes()));
+                      ByteBuffer.wrap(desc.toString().getBytes(StandardCharsets.UTF_8)));
                   desc.delete(0, desc.length());
                 }
                 if (title.length() > 0) {
                   row.getMetadata().put(new Utf8("_dmoz_title_"),
-                      ByteBuffer.wrap(title.toString().getBytes()));
+                      ByteBuffer.wrap(title.toString().getBytes(StandardCharsets.UTF_8)));
                   title.delete(0, title.length());
                 }
                 store.put(reversedUrl, row);

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/ResolveUrls.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/ResolveUrls.java b/src/java/org/apache/nutch/tools/ResolveUrls.java
index fe8c24f..8c8bf97 100644
--- a/src/java/org/apache/nutch/tools/ResolveUrls.java
+++ b/src/java/org/apache/nutch/tools/ResolveUrls.java
@@ -17,9 +17,10 @@
 package org.apache.nutch.tools;
 
 import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.net.InetAddress;
+import java.nio.charset.StandardCharsets;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
@@ -102,8 +103,7 @@ public class ResolveUrls {
       pool = Executors.newFixedThreadPool(numThreads);
 
       // read in the urls file and loop through each line, one url per line
-      BufferedReader buffRead = new BufferedReader(new FileReader(new File(
-          urlsFile)));
+      BufferedReader buffRead = new BufferedReader(new InputStreamReader(new FileInputStream(urlsFile), StandardCharsets.UTF_8));
       String urlStr = null;
       while ((urlStr = buffRead.readLine()) != null) {
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
index 2b6a3f9..d3f9799 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
@@ -18,6 +18,7 @@ package org.apache.nutch.tools.arc;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.zip.GZIPInputStream;
 
 import org.slf4j.Logger;
@@ -269,7 +270,7 @@ public class ArcRecordReader implements RecordReader<Text, BytesWritable> {
         }
 
         // create the header and the raw content minus the header
-        String header = new String(content, 0, eol).trim();
+        String header = new String(content, 0, eol, StandardCharsets.UTF_8).trim();
         byte[] raw = new byte[(content.length - eol) - 1];
         System.arraycopy(content, eol + 1, raw, 0, raw.length);
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java b/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
index fce2d3b..699cfa3 100644
--- a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
+++ b/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
@@ -35,6 +35,7 @@ package org.apache.nutch.tools.proxy;
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicLong;
 
@@ -118,7 +119,7 @@ public class FakeHandler extends AbstractTestbedHandler {
       os.write(bytes);
       // record URI
       String p = "<p>URI: " + uri + "</p>\r\n";
-      os.write(p.getBytes());
+      os.write(p.getBytes(StandardCharsets.UTF_8));
       // fake some links
       String basePath;
       String baseDomain;
@@ -142,7 +143,7 @@ public class FakeHandler extends AbstractTestbedHandler {
           link += pageSeq.getAndIncrement() + ".html'>";
         }
         link += "outlink " + i + "</a></p>\r\n";
-        os.write(link.getBytes());
+        os.write(link.getBytes(StandardCharsets.UTF_8));
       }
       baseDomain = u.getHost();
       // chop off the TLD
@@ -160,15 +161,15 @@ public class FakeHandler extends AbstractTestbedHandler {
           link = "http://" + host + "/";
         }
         link = "<p><a href='" + link + "'>fake host " + host + "</a></p>\r\n";
-        os.write(link.getBytes());
+        os.write(link.getBytes(StandardCharsets.UTF_8));
       }
       // fake a link to the root URL
       link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
       if (u.getPort() != 80 && u.getPort() != -1)
         link += ":" + u.getPort();
       link += "/'>site " + u.getHost() + "</a></p>\r\n";
-      os.write(link.getBytes());
-      os.write(testB.getBytes());
+      os.write(link.getBytes(StandardCharsets.UTF_8));
+      os.write(testB.getBytes(StandardCharsets.UTF_8));
       res.flushBuffer();
     } catch (IOException ioe) {
     }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/Bytes.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java
index 87323a6..db9f468 100644
--- a/src/java/org/apache/nutch/util/Bytes.java
+++ b/src/java/org/apache/nutch/util/Bytes.java
@@ -28,6 +28,7 @@ import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.util.Comparator;
 import java.util.Iterator;
+import java.util.Locale;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -396,7 +397,7 @@ public class Bytes {
             || " `~!@#$%^&*()-_=+[]{}\\|;:'\",.<>/?".indexOf(ch) >= 0) {
           result.append(first.charAt(i));
         } else {
-          result.append(String.format("\\x%02X", ch));
+          result.append(String.format(Locale.ROOT, "\\x%02X", ch));
         }
       }
     } catch (UnsupportedEncodingException e) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/EncodingDetector.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java
index ff6cf00..5b40e29 100644
--- a/src/java/org/apache/nutch/util/EncodingDetector.java
+++ b/src/java/org/apache/nutch/util/EncodingDetector.java
@@ -32,6 +32,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 
 /**
  * A simple class for detecting character encodings.
@@ -72,7 +73,7 @@ public class EncodingDetector {
     }
 
     public EncodingClue(String value, String source, int confidence) {
-      this.value = value.toLowerCase();
+      this.value = value.toLowerCase(Locale.ROOT);
       this.source = source;
       this.confidence = confidence;
     }
@@ -269,7 +270,7 @@ public class EncodingDetector {
           LOG.trace(baseUrl + ": Choosing encoding: " + charset
               + " with confidence " + clue.confidence);
         }
-        return resolveEncodingAlias(charset).toLowerCase();
+        return resolveEncodingAlias(charset).toLowerCase(Locale.ROOT);
       } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
         bestClue = clue;
       }
@@ -278,7 +279,7 @@ public class EncodingDetector {
     if (LOG.isTraceEnabled()) {
       LOG.trace(baseUrl + ": Choosing encoding: " + bestClue);
     }
-    return bestClue.value.toLowerCase();
+    return bestClue.value.toLowerCase(Locale.ROOT);
   }
 
   /** Clears all clues. */

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 8f77969..524bee6 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -18,6 +18,7 @@
 package org.apache.nutch.util;
 
 import java.text.NumberFormat;
+import java.util.Locale;
 
 public class TimingUtil {
 
@@ -45,7 +46,7 @@ public class TimingUtil {
       start += TIME_FACTOR[i] * elapsedTime[i];
     }
 
-    NumberFormat nf = NumberFormat.getInstance();
+    NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
     nf.setMinimumIntegerDigits(2);
     StringBuffer buf = new StringBuffer();
     for (int i = 0; i < elapsedTime.length; i++) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/URLUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index df16423..5183ba1 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -19,6 +19,7 @@ package org.apache.nutch.util;
 
 import java.net.MalformedURLException;
 import java.net.*;
+import java.util.Locale;
 import java.util.regex.Pattern;
 
 import org.apache.nutch.util.domain.DomainSuffix;
@@ -386,7 +387,7 @@ public class URLUtil {
    */
   public static String getHost(String url) {
     try {
-      return new URL(url).getHost().toLowerCase();
+      return new URL(url).getHost().toLowerCase(Locale.ROOT);
     } catch (MalformedURLException e) {
       return null;
     }
@@ -404,7 +405,7 @@ public class URLUtil {
   public static String getPage(String url) {
     try {
       // get the full url, and replace the query string with and empty string
-      url = url.toLowerCase();
+      url = url.toLowerCase(Locale.ROOT);
       String queryStr = new URL(url).getQuery();
       return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
     } catch (MalformedURLException e) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index 57eb81e..7313a03 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -20,6 +20,7 @@ package org.apache.nutch.util.domain;
 import java.io.IOException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
+import java.util.Locale;
 
 import org.apache.gora.mapreduce.GoraMapper;
 import org.apache.gora.query.Query;
@@ -97,7 +98,7 @@ public class DomainStatistics extends Configured implements Tool {
       }
     }
 
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
     long start = System.currentTimeMillis();
     LOG.info("DomainStatistics: starting at " + sdf.format(start));
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
index ea19a8a..107771a 100644
--- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
+++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
@@ -18,6 +18,7 @@ package org.apache.nutch.webui.client.impl;
 
 import java.io.Serializable;
 import java.text.MessageFormat;
+import java.util.Locale;
 
 import org.apache.commons.lang3.StringUtils;
 import org.apache.nutch.webui.client.model.JobConfig;
@@ -68,9 +69,8 @@ public class RemoteCommand implements Serializable {
   public String toString() {
     String statusInfo = StringUtils.EMPTY;
     if (jobInfo != null) {
-      statusInfo = MessageFormat.format("{0}", jobInfo.getState());
+      statusInfo = new MessageFormat("{0}", Locale.ROOT).format(jobInfo.getState());
     }
-    return MessageFormat.format("{0} status: {1}", jobConfig.getType(),
-        statusInfo);
+    return new MessageFormat("{0} status: {1}", Locale.ROOT).format(new Object[] {jobConfig.getType(), statusInfo});
   }
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
index f8db384..be427ef 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
@@ -37,6 +37,7 @@ import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.nio.charset.StandardCharsets;
 
 /** Adds metadata identifying the Creative Commons license used, if any. */
 public class CCParseFilter implements ParseFilter {
@@ -87,9 +88,9 @@ public class CCParseFilter implements ParseFilter {
               + " of " + base);
         }
         page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
-            ByteBuffer.wrap(licenseUrl.getBytes()));
+            ByteBuffer.wrap(licenseUrl.getBytes(StandardCharsets.UTF_8)));
         page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
-            ByteBuffer.wrap(licenseLocation.getBytes()));
+            ByteBuffer.wrap(licenseLocation.getBytes(StandardCharsets.UTF_8)));
       }
 
       if (walker.workType != null) {
@@ -97,7 +98,7 @@ public class CCParseFilter implements ParseFilter {
           LOG.debug("CC: found " + walker.workType + " in " + base);
         }
         page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
-            ByteBuffer.wrap(walker.workType.getBytes()));
+            ByteBuffer.wrap(walker.workType.getBytes(StandardCharsets.UTF_8)));
       }
 
     }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
index 25149be..9e2e75b 100644
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
+++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
@@ -30,6 +30,7 @@ import java.lang.CharSequence;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map.Entry;
+import java.util.Locale;
 
 /**
  * Indexing filter that offers an option to either index all inbound anchor text
@@ -97,7 +98,7 @@ public class AnchorIndexingFilter implements IndexingFilter {
       if (deduplicate) {
         if (set == null)
           set = new HashSet<String>();
-        String lcAnchor = anchor.toLowerCase();
+        String lcAnchor = anchor.toLowerCase(Locale.ROOT);
 
         // Check if already processed the current anchor
         if (!set.contains(lcAnchor)) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
index 6db3bea..eb1454b 100644
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
+++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
@@ -18,6 +18,7 @@ package org.apache.nutch.indexer.html;
 
 import java.io.ByteArrayInputStream;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Scanner;
@@ -67,7 +68,7 @@ public class HtmlIndexingFilter implements IndexingFilter {
                 LOG.info("Html indexing for: " + url.toString());
             }
             ByteArrayInputStream arrayInputStream = new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining());
-            Scanner scanner = new Scanner(arrayInputStream);
+            Scanner scanner = new Scanner(arrayInputStream, StandardCharsets.UTF_8.name());
             scanner.useDelimiter("\\Z");//To read all scanner content in one String
             String data = "";
             if (scanner.hasNext()) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
index 2e9da51..206831d 100644
--- a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
+++ b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
@@ -26,6 +26,7 @@ import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Test;
 
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 
 import static org.junit.Assert.*;
 
@@ -81,7 +82,7 @@ public class TestMoreIndexingFilter {
     filter.setConf(conf);
     WebPage page = WebPage.newBuilder().build();
     String url = "http://www.example.com/";
-    page.setContent(ByteBuffer.wrap("text".getBytes()));
+    page.setContent(ByteBuffer.wrap("text".getBytes(StandardCharsets.UTF_8)));
     page.setTitle(new Utf8("title"));
     page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
     NutchDocument doc = filter.filter(new NutchDocument(), url, page);

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
index ee0560b..f3af6a9 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -39,6 +39,7 @@ import org.w3c.dom.Node;
 
 import java.lang.CharSequence;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 
 /**
@@ -67,7 +68,7 @@ public class HTMLLanguageParser implements ParseFilter {
         String[] values = p.getProperty(key).split(",", -1);
         LANGUAGES_MAP.put(key, key);
         for (int i = 0; i < values.length; i++) {
-          LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+          LANGUAGES_MAP.put(values[i].trim().toLowerCase(Locale.ROOT), key);
         }
       }
     } catch (Exception e) {
@@ -115,7 +116,7 @@ public class HTMLLanguageParser implements ParseFilter {
 
     if (lang != null) {
       page.getMetadata().put(new Utf8(Metadata.LANGUAGE),
-          ByteBuffer.wrap(lang.getBytes()));
+          ByteBuffer.wrap(lang.getBytes(StandardCharsets.UTF_8)));
       return parse;
     }
 
@@ -255,7 +256,7 @@ public class HTMLLanguageParser implements ParseFilter {
                 Node attrnode = attrs.item(i);
                 if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
                   if ("content-language".equals(attrnode.getNodeValue()
-                      .toLowerCase())) {
+                      .toLowerCase(Locale.ROOT))) {
                     Node valueattr = attrs.getNamedItem("content");
                     if (valueattr != null) {
                       httpEquiv = parseLanguage(valueattr.getNodeValue());
@@ -296,7 +297,7 @@ public class HTMLLanguageParser implements ParseFilter {
         code = langs[i].split("-")[0];
         code = code.split("_")[0];
         // Find the ISO 639 code
-        language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+        language = (String) LANGUAGES_MAP.get(code.toLowerCase(Locale.ROOT));
         i++;
       }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
index c98c00f..1432999 100644
--- a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
@@ -31,6 +31,7 @@ import org.junit.Test;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
@@ -107,7 +108,7 @@ public class TestHTMLLanguageParser {
       long total = 0;
       LanguageIdentifier identifier;
       BufferedReader in = new BufferedReader(new InputStreamReader(this
-          .getClass().getResourceAsStream("test-referencial.txt")));
+          .getClass().getResourceAsStream("test-referencial.txt"), StandardCharsets.UTF_8));
       String line = null;
       while ((line = in.readLine()) != null) {
         String[] tokens = line.split(";");
@@ -149,7 +150,7 @@ public class TestHTMLLanguageParser {
   private WebPage getPage(String text) {
     WebPage page = WebPage.newBuilder().build();
     page.setBaseUrl(BASE);
-    page.setContent(ByteBuffer.wrap(text.getBytes()));
+    page.setContent(ByteBuffer.wrap(text.getBytes(StandardCharsets.UTF_8)));
     page.setContentType(new Utf8("text/html"));
     page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8,
         new Utf8("text/html"));

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index d0a4726..0a6121b 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.Reader;
 import java.net.URL;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
@@ -516,7 +517,7 @@ public abstract class HttpBase implements Protocol {
       System.out.println("Content Length: "
           + content.getMetadata().get(Response.CONTENT_LENGTH));
       System.out.println("Content:");
-      String text = new String(content.getContent());
+      String text = new String(content.getContent(), StandardCharsets.UTF_8);
       System.out.println(text);
     }
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 1d6ea55..bd64d76 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -28,6 +28,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.net.URL;
+import java.util.Locale;
 
 /**
  * This class is used for parsing robots for urls belonging to HTTP protocol. It
@@ -50,9 +51,9 @@ public class HttpRobotRulesParser extends RobotRulesParser {
 
   /** Compose unique key to store and access robot rules in cache for given URL */
   protected static String getCacheKey(URL url) {
-    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+    String protocol = url.getProtocol().toLowerCase(Locale.ROOT); // normalize to lower
                                                        // case
-    String host = url.getHost().toLowerCase(); // normalize to lower case
+    String host = url.getHost().toLowerCase(Locale.ROOT); // normalize to lower case
     int port = url.getPort();
     if (port == -1) {
       port = url.getDefaultPort();

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 47b41a3..8d033e9 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.protocol.http.api;
 
+import java.nio.charset.StandardCharsets;
+
 import org.junit.Before;
 import org.junit.Test;
 
@@ -80,7 +82,7 @@ public class TestRobotRulesParser {
    */
   @Test
   public void testRobotsAgent() {
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
         CONTENT_TYPE, SINGLE_AGENT);
 
     for (int counter = 0; counter < TEST_PATHS.length; counter++) {
@@ -91,7 +93,7 @@ public class TestRobotRulesParser {
           rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
     }
 
-    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
         CONTENT_TYPE, MULTIPLE_AGENTS);
 
     for (int counter = 0; counter < TEST_PATHS.length; counter++) {
@@ -112,13 +114,13 @@ public class TestRobotRulesParser {
   public void testCrawlDelay() {
     // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
     // returned by the parser
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
         CONTENT_TYPE, SINGLE_AGENT);
     assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
         (rules.getCrawlDelay() == 10000));
 
     // for UNKNOWN_AGENT, the default crawl delay must be returned.
-    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
         CONTENT_TYPE, UNKNOWN_AGENT);
     assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
         (rules.getCrawlDelay() == Long.MIN_VALUE));

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index 40ba266..d374e95 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -22,8 +22,10 @@ import java.io.Reader;
 import java.io.FileReader;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.ArrayList;
 
@@ -82,7 +84,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
    */
   public RegexURLFilterBase(File filename) throws IOException,
       IllegalArgumentException {
-    this(new FileReader(filename));
+    this(new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8));
   }
 
   /**
@@ -245,7 +247,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
   public static void main(RegexURLFilterBase filter, String args[])
       throws IOException, IllegalArgumentException {
 
-    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
     String line;
     while ((line = in.readLine()) != null) {
       String out = filter.filter(line);

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
index 2b40b48..ae4660f 100644
--- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -21,6 +21,10 @@ import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.FileInputStream;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -57,8 +61,8 @@ public abstract class RegexURLFilterBaseTest {
 
   protected void bench(int loops, String file) {
     try {
-      bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
-          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+      bench(loops, new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".rules"), StandardCharsets.UTF_8),
+          new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".urls"), StandardCharsets.UTF_8));
     } catch (Exception e) {
       fail(e.toString());
     }
@@ -81,8 +85,8 @@ public abstract class RegexURLFilterBaseTest {
 
   protected void test(String file) {
     try {
-      test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
-          new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+      test(new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".rules"), StandardCharsets.UTF_8),
+          new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".urls"), StandardCharsets.UTF_8));
     } catch (Exception e) {
       fail(e.toString());
     }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
index f71c5ab..00fa30d 100644
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
+++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -20,6 +20,7 @@ package org.apache.nutch.microformats.reltag;
 import java.net.URL;
 import java.net.URLDecoder;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -171,7 +172,7 @@ public class RelTagParser implements ParseFilter {
       sb.append(iter.next());
       sb.append("\t");
     }
-    ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
+    ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes(StandardCharsets.UTF_8));
     page.getMetadata().put(new Utf8(REL_TAG), bb);
     return parse;
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java b/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
index 064b46b..66964de 100644
--- a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
+++ b/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
@@ -34,6 +34,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 
 import static org.junit.Assert.assertEquals;
 
@@ -90,7 +91,7 @@ public class TestRelTagParser {
     ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag"));
     byte[] byteArray = new byte[bbuf.remaining()];
     bbuf.get(byteArray);
-    String s = new String(byteArray);
+    String s = new String(byteArray, StandardCharsets.UTF_8);
     // bbuf.flip();
     assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter",
         expectedRelTags, s);

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 3ba3716..8e079fb 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -22,6 +22,7 @@ import java.net.MalformedURLException;
 import java.util.Collection;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Locale;
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
@@ -320,7 +321,7 @@ public class DOMContentUtils {
 
       if (nodeType == Node.ELEMENT_NODE) {
 
-        nodeName = nodeName.toLowerCase();
+        nodeName = nodeName.toLowerCase(Locale.ROOT);
         LinkParams params = linkParams.get(nodeName);
         if (params != null) {
           if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
index 159aa76..3e066c4 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -18,6 +18,7 @@
 package org.apache.nutch.parse.html;
 
 import java.net.URL;
+import java.util.Locale;
 
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.w3c.dom.*;
@@ -64,7 +65,7 @@ public class HTMLMetaProcessor {
         // Retrieves name, http-equiv and content attribues
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
-          String attrName = attr.getNodeName().toLowerCase();
+          String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
           if (attrName.equals("name")) {
             nameNode = attr;
           } else if (attrName.equals("http-equiv")) {
@@ -76,12 +77,12 @@ public class HTMLMetaProcessor {
 
         if (nameNode != null) {
           if (contentNode != null) {
-            String name = nameNode.getNodeValue().toLowerCase();
+            String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
             metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
 
               if (contentNode != null) {
-                String directives = contentNode.getNodeValue().toLowerCase();
+                String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT);
                 int index = directives.indexOf("none");
 
                 if (index >= 0) {
@@ -116,11 +117,11 @@ public class HTMLMetaProcessor {
 
         if (equivNode != null) {
           if (contentNode != null) {
-            String name = equivNode.getNodeValue().toLowerCase();
+            String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
             String content = contentNode.getNodeValue();
             metaTags.getHttpEquivTags().setProperty(name, content);
             if ("pragma".equals(name)) {
-              content = content.toLowerCase();
+              content = content.toLowerCase(Locale.ROOT);
               int index = content.indexOf("no-cache");
               if (index >= 0)
                 metaTags.setNoCache();
@@ -140,7 +141,7 @@ public class HTMLMetaProcessor {
               }
               URL refreshUrl = null;
               if (metaTags.getRefresh() && idx != -1) { // set the URL
-                idx = content.toLowerCase().indexOf("url=");
+                idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
                 if (idx == -1) { // assume a mis-formatted entry with just the
                                  // url
                   idx = content.indexOf(';') + 1;

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 5440ec7..3255dcc 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.util.NutchConfiguration;
 
 import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
@@ -182,7 +183,7 @@ public class TestDOMContentUtils {
       DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
       try {
         parser.parse(
-            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+            new InputSource(new ByteArrayInputStream(testPages[i].getBytes(StandardCharsets.UTF_8))),
             node);
         testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
       } catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
index 8c58ca4..f390041 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
@@ -20,6 +20,7 @@ package org.apache.nutch.parse.html;
 import org.apache.nutch.parse.HTMLMetaTags;
 
 import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
 import java.net.URL;
 
 import org.cyberneko.html.parsers.*;
@@ -123,7 +124,7 @@ public class TestRobotsMetaProcessor {
     }
 
     for (int i = 0; i < tests.length; i++) {
-      byte[] bytes = tests[i].getBytes();
+      byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
 
       DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
index fc2e930..a481755 100644
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -26,6 +26,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
+import java.util.Locale;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -150,7 +151,7 @@ public class JSParseFilter implements ParseFilter, Parser {
             links = getJSLinks(anode.getNodeValue(), "", base);
           } else if (anode.getNodeName().equalsIgnoreCase("href")) {
             String val = anode.getNodeValue();
-            if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
+            if (val != null && val.toLowerCase(Locale.ROOT).indexOf("javascript:") != -1) {
               links = getJSLinks(val, "", base);
             }
           }
@@ -178,7 +179,7 @@ public class JSParseFilter implements ParseFilter, Parser {
   public Parse getParse(String url, WebPage page) {
     String type = TableUtil.toString(page.getContentType());
     if (type != null && !type.trim().equals("")
-        && !type.toLowerCase().startsWith("application/x-javascript"))
+        && !type.toLowerCase(Locale.ROOT).startsWith("application/x-javascript"))
       return ParseStatusUtils.getEmptyParse(
           ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '"
               + type + "'", getConf());

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 2aac3c6..f61838c 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -27,6 +27,7 @@ import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
 import java.util.Map.Entry;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.avro.util.Utf8;
 import org.apache.commons.logging.Log;
@@ -83,7 +84,7 @@ public class MetaTagsParser implements ParseFilter {
         LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
       }
       metadata.put(new Utf8(PARSE_META_PREFIX + lcMetatag),
-          ByteBuffer.wrap(value.getBytes()));
+          ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8)));
     }
   }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
index 1b42263..a13eac7 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
@@ -48,6 +48,7 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.util.Map;
+import java.util.Locale;
 
 public class TestMetaTagsParser {
 
@@ -129,7 +130,7 @@ public class TestMetaTagsParser {
         // Retrieves name, http-equiv and content attribues
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
-          String attrName = attr.getNodeName().toLowerCase();
+          String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
           if (attrName.equals("name")) {
             nameNode = attr;
           } else if (attrName.equals("http-equiv")) {
@@ -140,14 +141,14 @@ public class TestMetaTagsParser {
         }
         if (nameNode != null) {
           if (contentNode != null) {
-            String name = nameNode.getNodeValue().toLowerCase();
+            String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
             metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
           }
         }
 
         if (equivNode != null) {
           if (contentNode != null) {
-            String name = equivNode.getNodeValue().toLowerCase();
+            String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
             String content = contentNode.getNodeValue();
             metaTags.getHttpEquivTags().setProperty(name, content);
           }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index ae1cb44..ee95862 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -22,6 +22,8 @@ import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.Locale;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.Outlink;
@@ -321,7 +323,7 @@ public class DOMContentUtils {
 
       if (nodeType == Node.ELEMENT_NODE) {
 
-        nodeName = nodeName.toLowerCase();
+        nodeName = nodeName.toLowerCase(Locale.ROOT);
         LinkParams params = (LinkParams) linkParams.get(nodeName);
         if (params != null) {
           if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 294bde9..0818eff 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -18,6 +18,7 @@
 package org.apache.nutch.parse.tika;
 
 import java.net.URL;
+import java.util.Locale;
 
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.w3c.dom.*;
@@ -64,7 +65,7 @@ public class HTMLMetaProcessor {
         // Retrieves name, http-equiv and content attribues
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
-          String attrName = attr.getNodeName().toLowerCase();
+          String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
           if (attrName.equals("name")) {
             nameNode = attr;
           } else if (attrName.equals("http-equiv")) {
@@ -76,12 +77,12 @@ public class HTMLMetaProcessor {
 
         if (nameNode != null) {
           if (contentNode != null) {
-            String name = nameNode.getNodeValue().toLowerCase();
+            String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
             metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
 
               if (contentNode != null) {
-                String directives = contentNode.getNodeValue().toLowerCase();
+                String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT);
                 int index = directives.indexOf("none");
 
                 if (index >= 0) {
@@ -116,11 +117,11 @@ public class HTMLMetaProcessor {
 
         if (equivNode != null) {
           if (contentNode != null) {
-            String name = equivNode.getNodeValue().toLowerCase();
+            String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
             String content = contentNode.getNodeValue();
             metaTags.getHttpEquivTags().setProperty(name, content);
             if ("pragma".equals(name)) {
-              content = content.toLowerCase();
+              content = content.toLowerCase(Locale.ROOT);
               int index = content.indexOf("no-cache");
               if (index >= 0)
                 metaTags.setNoCache();
@@ -140,7 +141,7 @@ public class HTMLMetaProcessor {
               }
               URL refreshUrl = null;
               if (metaTags.getRefresh() && idx != -1) { // set the URL
-                idx = content.toLowerCase().indexOf("url=");
+                idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
                 if (idx == -1) { // assume a mis-formatted entry with just the
                                  // url
                   idx = content.indexOf(';') + 1;

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
index 957d664..06bea9f 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
@@ -39,6 +39,7 @@ import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
 
 import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.StringTokenizer;
@@ -210,7 +211,7 @@ public class DOMContentUtilsTest {
       // to add once available in Tika
       // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
       try {
-        parser.parse(new ByteArrayInputStream(testPages[i].getBytes()),
+        parser.parse(new ByteArrayInputStream(testPages[i].getBytes(StandardCharsets.UTF_8)),
             domhandler, tikamd, context);
         testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
       } catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 3a1204c..350be0e 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -23,6 +23,7 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
@@ -72,14 +73,14 @@ public class TestImageMetadata {
       ByteBuffer bbufW = page.getMetadata().get(new Utf8("width"));
       byte[] byteArrayW = new byte[bbufW.remaining()];
       bbufW.get(byteArrayW);
-      String width = new String(byteArrayW);
+      String width = new String(byteArrayW, StandardCharsets.UTF_8);
       assertEquals("121", width);
 
       // assert height
       ByteBuffer bbufH = page.getMetadata().get(new Utf8("height"));
       byte[] byteArrayH = new byte[bbufH.remaining()];
       bbufH.get(byteArrayH);
-      String height = new String(byteArrayH);
+      String height = new String(byteArrayH, StandardCharsets.UTF_8);
       assertEquals("48", height);
     }
   }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index e7139cc..0695439 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -28,6 +28,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 import java.util.HashSet;
 
@@ -218,7 +219,7 @@ public class File implements Protocol {
     }
 
     if (dumpContent) {
-      System.out.print(new String(content.getContent()));
+      System.out.print(new String(content.getContent(), StandardCharsets.UTF_8));
     }
 
     file = null;

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
index 410f8e6..0e5f2b0 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -20,6 +20,7 @@ package org.apache.nutch.protocol.file;
 // JDK imports
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.net.URL;
 
 import org.apache.hadoop.conf.Configuration;
@@ -275,7 +276,7 @@ public class FileResponse {
 
     x.append("</pre></body></html>\n");
 
-    return new String(x).getBytes();
+    return new String(x).getBytes(StandardCharsets.UTF_8);
   }
 
 }

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
index 2478b0a..ffa2091 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.net.InetAddress;
 import java.net.Socket;
 import java.util.List;
@@ -330,7 +331,7 @@ public class Client extends FTP {
           + ((path == null) ? "" : path));
 
     BufferedReader reader = new BufferedReader(new InputStreamReader(
-        socket.getInputStream()));
+        socket.getInputStream(), StandardCharsets.UTF_8));
 
     // force-close data channel socket, when download limit is reached
     // boolean mandatory_close = false;

http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index ccfae0a..3f3a7e8 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -31,6 +31,7 @@ import java.io.IOException;
 import java.net.URL;
 import java.util.Collection;
 import java.util.HashSet;
+import java.nio.charset.StandardCharsets;
 
 /**
  * This class is a protocol plugin used for ftp: scheme. It creates
@@ -243,7 +244,7 @@ public class Ftp implements Protocol {
     System.err.println("Last-Modified: "
         + content.getMetadata().get(Response.LAST_MODIFIED));
     if (dumpContent) {
-      System.out.print(new String(content.getContent()));
+      System.out.print(new String(content.getContent(), StandardCharsets.UTF_8));
     }
 
     ftp = null;