You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/08/30 16:54:43 UTC
[2/3] nutch git commit: NUTCH-2264 Forbidden APIs are Checked at Build
NUTCH-2264 Forbidden APIs are Checked at Build
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/a671540a
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/a671540a
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/a671540a
Branch: refs/heads/2.x
Commit: a671540a94d8afafd72a09396c97d9ede43a7ea2
Parents: 9ecdc9b
Author: Furkan KAMACI <fu...@gmail.com>
Authored: Mon Aug 29 18:24:50 2016 +0300
Committer: Furkan KAMACI <fu...@gmail.com>
Committed: Mon Aug 29 18:44:25 2016 +0300
----------------------------------------------------------------------
build.xml | 29 ++++++++++++++++++++
ivy/ivy.xml | 2 ++
.../org/apache/nutch/api/impl/JobWorker.java | 9 +++---
.../nutch/api/resources/AdminResource.java | 3 +-
.../nutch/api/resources/SeedResource.java | 9 +++---
.../org/apache/nutch/crawl/DbUpdaterJob.java | 3 +-
.../org/apache/nutch/crawl/GeneratorJob.java | 3 +-
.../org/apache/nutch/crawl/InjectorJob.java | 5 ++--
.../org/apache/nutch/fetcher/FetcherJob.java | 3 +-
.../apache/nutch/fetcher/FetcherReducer.java | 8 +++---
.../apache/nutch/host/HostDbUpdateReducer.java | 5 ++--
.../org/apache/nutch/host/HostInjectorJob.java | 3 +-
.../org/apache/nutch/net/URLFilterChecker.java | 5 ++--
.../apache/nutch/net/URLNormalizerChecker.java | 5 ++--
src/java/org/apache/nutch/parse/ParseUtil.java | 8 ++++--
src/java/org/apache/nutch/parse/ParserJob.java | 3 +-
src/java/org/apache/nutch/protocol/Content.java | 3 +-
.../apache/nutch/protocol/RobotRulesParser.java | 6 ++--
src/java/org/apache/nutch/tools/Benchmark.java | 3 +-
src/java/org/apache/nutch/tools/DmozParser.java | 5 ++--
.../org/apache/nutch/tools/ResolveUrls.java | 8 +++---
.../apache/nutch/tools/arc/ArcRecordReader.java | 3 +-
.../apache/nutch/tools/proxy/FakeHandler.java | 11 ++++----
src/java/org/apache/nutch/util/Bytes.java | 3 +-
.../org/apache/nutch/util/EncodingDetector.java | 7 +++--
src/java/org/apache/nutch/util/TimingUtil.java | 3 +-
src/java/org/apache/nutch/util/URLUtil.java | 5 ++--
.../nutch/util/domain/DomainStatistics.java | 3 +-
.../nutch/webui/client/impl/RemoteCommand.java | 6 ++--
.../creativecommons/nutch/CCParseFilter.java | 7 +++--
.../indexer/anchor/AnchorIndexingFilter.java | 3 +-
.../nutch/indexer/html/HtmlIndexingFilter.java | 3 +-
.../indexer/more/TestMoreIndexingFilter.java | 3 +-
.../nutch/analysis/lang/HTMLLanguageParser.java | 9 +++---
.../analysis/lang/TestHTMLLanguageParser.java | 5 ++--
.../nutch/protocol/http/api/HttpBase.java | 3 +-
.../protocol/http/api/HttpRobotRulesParser.java | 5 ++--
.../protocol/http/api/TestRobotRulesParser.java | 10 ++++---
.../nutch/urlfilter/api/RegexURLFilterBase.java | 6 ++--
.../urlfilter/api/RegexURLFilterBaseTest.java | 12 +++++---
.../nutch/microformats/reltag/RelTagParser.java | 3 +-
.../microformats/reltag/TestRelTagParser.java | 3 +-
.../nutch/parse/html/DOMContentUtils.java | 3 +-
.../nutch/parse/html/HTMLMetaProcessor.java | 13 +++++----
.../nutch/parse/html/TestDOMContentUtils.java | 3 +-
.../parse/html/TestRobotsMetaProcessor.java | 3 +-
.../apache/nutch/parse/js/JSParseFilter.java | 5 ++--
.../nutch/parse/metatags/MetaTagsParser.java | 3 +-
.../parse/metatags/TestMetaTagsParser.java | 7 +++--
.../nutch/parse/tika/DOMContentUtils.java | 4 ++-
.../nutch/parse/tika/HTMLMetaProcessor.java | 13 +++++----
.../nutch/parse/tika/DOMContentUtilsTest.java | 3 +-
.../nutch/parse/tika/TestImageMetadata.java | 5 ++--
.../org/apache/nutch/protocol/file/File.java | 3 +-
.../nutch/protocol/file/FileResponse.java | 3 +-
.../org/apache/nutch/protocol/ftp/Client.java | 3 +-
.../java/org/apache/nutch/protocol/ftp/Ftp.java | 3 +-
.../apache/nutch/protocol/ftp/FtpResponse.java | 5 ++--
.../nutch/protocol/ftp/FtpRobotRulesParser.java | 5 ++--
.../nutch/protocol/http/HttpResponse.java | 5 ++--
.../httpclient/HttpBasicAuthentication.java | 5 ++--
.../org/apache/nutch/protocol/sftp/Sftp.java | 3 +-
.../scoring/opic/TestOPICScoringFilter.java | 8 +++++-
.../nutch/collection/TestSubcollection.java | 3 +-
.../nutch/urlfilter/domain/DomainURLFilter.java | 8 ++++--
.../nutch/urlfilter/prefix/PrefixURLFilter.java | 3 +-
.../nutch/urlfilter/suffix/SuffixURLFilter.java | 15 ++++++----
.../urlnormalizer/basic/BasicURLNormalizer.java | 12 +++-----
.../urlnormalizer/regex/RegexURLNormalizer.java | 7 +++--
.../regex/TestRegexURLNormalizer.java | 5 ++--
.../apache/nutch/parse/TestSitemapParser.java | 20 ++------------
.../apache/nutch/plugin/TestPluginSystem.java | 12 ++++----
.../apache/nutch/util/TestEncodingDetector.java | 12 ++++----
.../org/apache/nutch/util/TestGZIPUtils.java | 25 +++++++++--------
.../org/apache/nutch/util/TestNodeWalker.java | 3 +-
75 files changed, 286 insertions(+), 185 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 061d0b7..f051838 100644
--- a/build.xml
+++ b/build.xml
@@ -63,6 +63,35 @@
</path>
<!-- ====================================================== -->
+ <!-- Forbidden APIs Targets -->
+ <!-- ====================================================== -->
+
+ <!-- will be used by forbiddenapis -->
+ <path id="all-lib-classpath">
+ <fileset dir="${build.lib.dir}">
+ <include name="**/*.jar" />
+ </fileset>
+ <fileset dir="${runtime.dir}/local/lib">
+ <include name="**/*.jar" />
+ </fileset>
+ <fileset dir="${runtime.dir}/local/plugins">
+ <include name="**/*.jar" />
+ </fileset>
+ </path>
+
+ <!-- forbiddenapis target -->
+ <target name="precommit" depends="runtime, test">
+ <!-- forbiddenapis task definition -->
+ <taskdef name="forbiddenapis" classname="de.thetaphi.forbiddenapis.ant.AntTask" classpath="${build.lib.dir}/forbiddenapis-2.2.jar"/>
+
+ <forbiddenapis classpathref="all-lib-classpath" dir="${build.dir}" targetVersion="${javac.version}">
+ <bundledsignatures name="jdk-unsafe"/>
+ <bundledsignatures name="jdk-deprecated"/>
+ <bundledsignatures name="jdk-non-portable"/>
+ </forbiddenapis>
+ </target>
+
+ <!-- ====================================================== -->
<!-- Stuff needed by all targets -->
<!-- ====================================================== -->
<target name="init" depends="ivy-init" description="--> stuff required by all targets">
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/ivy/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index db42162..e173e71 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -81,6 +81,8 @@
<dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3" conf="*->default" />
<dependency org="org.restlet.jee" name="org.restlet.ext.crypto" rev="2.2.3" conf="*->default" />
+ <dependency org="de.thetaphi" name="forbiddenapis" rev="2.2" conf="*->default"/>
+
<!--artifacts needed for testing -->
<dependency org="junit" name="junit" rev="4.11" conf="*->default" />
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/impl/JobWorker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/api/impl/JobWorker.java b/src/java/org/apache/nutch/api/impl/JobWorker.java
index 9c7c5c2..8ac78cc 100644
--- a/src/java/org/apache/nutch/api/impl/JobWorker.java
+++ b/src/java/org/apache/nutch/api/impl/JobWorker.java
@@ -17,6 +17,7 @@
package org.apache.nutch.api.impl;
import java.text.MessageFormat;
+import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.api.model.request.JobConfig;
@@ -49,11 +50,11 @@ public class JobWorker implements Runnable {
private String generateId() {
if (jobConfig.getCrawlId() == null) {
- return MessageFormat.format("{0}-{1}-{2}", jobConfig.getConfId(),
- jobConfig.getType(), String.valueOf(hashCode()));
+ return new MessageFormat("{0}-{1}-{2}", Locale.ROOT)
+ .format(new Object[] {jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())});
}
- return MessageFormat.format("{0}-{1}-{2}-{3}", jobConfig.getCrawlId(),
- jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode()));
+ return new MessageFormat("{0}-{1}-{2}-{3}", Locale.ROOT)
+ .format(new Object[] {jobConfig.getCrawlId(), jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())});
}
@Override
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/resources/AdminResource.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/api/resources/AdminResource.java b/src/java/org/apache/nutch/api/resources/AdminResource.java
index cfbf8d5..6e93f11 100644
--- a/src/java/org/apache/nutch/api/resources/AdminResource.java
+++ b/src/java/org/apache/nutch/api/resources/AdminResource.java
@@ -18,6 +18,7 @@ package org.apache.nutch.api.resources;
import java.text.MessageFormat;
import java.util.Date;
+import java.util.Locale;
import java.util.concurrent.TimeUnit;
import javax.ws.rs.GET;
@@ -72,7 +73,7 @@ public class AdminResource extends AbstractResource {
}
scheduleServerStop();
- return MessageFormat.format("Stopping in {0} seconds.", DELAY_SEC);
+ return new MessageFormat("Stopping in {0} seconds.", Locale.ROOT).format(DELAY_SEC);
}
private void scheduleServerStop() {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/resources/SeedResource.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/api/resources/SeedResource.java b/src/java/org/apache/nutch/api/resources/SeedResource.java
index d7439e0..472c842 100644
--- a/src/java/org/apache/nutch/api/resources/SeedResource.java
+++ b/src/java/org/apache/nutch/api/resources/SeedResource.java
@@ -20,9 +20,10 @@ import static javax.ws.rs.core.Response.status;
import java.io.BufferedWriter;
import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
+import java.io.OutputStreamWriter;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.Collection;
import javax.ws.rs.Consumes;
@@ -90,9 +91,7 @@ public class SeedResource extends AbstractResource {
private BufferedWriter getWriter(File seedFile) {
try {
- return new BufferedWriter(new FileWriter(seedFile));
- } catch (FileNotFoundException e) {
- throw handleException(e);
+ return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seedFile), StandardCharsets.UTF_8));
} catch (IOException e) {
throw handleException(e);
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java b/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
index 4b1618c..3885b68 100644
--- a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
+++ b/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
@@ -19,6 +19,7 @@ package org.apache.nutch.crawl;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
import org.apache.avro.util.Utf8;
@@ -129,7 +130,7 @@ public class DbUpdaterJob extends NutchTool implements Tool {
private int updateTable(String crawlId, String batchId) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
long start = System.currentTimeMillis();
LOG.info("DbUpdaterJob: starting at " + sdf.format(start));
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/GeneratorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index e06a192..f47637f 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java
@@ -25,6 +25,7 @@ import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.Collection;
+import java.util.Locale;
import org.apache.hadoop.mapreduce.Job;
import org.slf4j.Logger;
@@ -255,7 +256,7 @@ public class GeneratorJob extends NutchTool implements Tool {
public String generate(long topN, long curTime, boolean filter, boolean norm,
boolean sitemap) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
long start = System.currentTimeMillis();
LOG.info("GeneratorJob: starting at {}", sdf.format(start));
LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/InjectorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/crawl/InjectorJob.java b/src/java/org/apache/nutch/crawl/InjectorJob.java
index 5094b0f..df91a73 100644
--- a/src/java/org/apache/nutch/crawl/InjectorJob.java
+++ b/src/java/org/apache/nutch/crawl/InjectorJob.java
@@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.*;
@@ -181,7 +182,7 @@ public class InjectorJob extends NutchTool implements Tool {
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
row.getMetadata().put(new Utf8(keymd),
- ByteBuffer.wrap(valuemd.getBytes()));
+ ByteBuffer.wrap(valuemd.getBytes(StandardCharsets.UTF_8)));
}
if (customScore != -1)
@@ -260,7 +261,7 @@ public class InjectorJob extends NutchTool implements Tool {
}
public void inject(Path urlDir) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
long start = System.currentTimeMillis();
LOG.info("InjectorJob: starting at " + sdf.format(start));
LOG.info("InjectorJob: Injecting urlDir: " + urlDir);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/fetcher/FetcherJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java
index 268d9f6..a7f3df8 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherJob.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java
@@ -22,6 +22,7 @@ import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
+import java.util.Locale;
import org.apache.avro.util.Utf8;
import org.apache.gora.filter.FilterOp;
import org.apache.gora.filter.MapFieldValueFilter;
@@ -278,7 +279,7 @@ public class FetcherJob extends NutchTool implements Tool {
public int fetch(String batchId, int threads, boolean shouldResume,
int numTasks, boolean stmDetect, boolean sitemap) throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
long start = System.currentTimeMillis();
LOG.info("FetcherJob: starting at " + sdf.format(start));
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
index 8ee7477..68b982d 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
@@ -106,7 +106,7 @@ public class FetcherReducer extends
LOG.warn("Cannot parse url: " + url, e);
return null;
}
- final String proto = u.getProtocol().toLowerCase();
+ final String proto = u.getProtocol().toLowerCase(Locale.ROOT);
String host;
if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
try {
@@ -131,7 +131,7 @@ public class FetcherReducer extends
host = u.toExternalForm();
}
}
- queueID = proto + "://" + host.toLowerCase();
+ queueID = proto + "://" + host.toLowerCase(Locale.ROOT);
return new FetchItem(url, page, u, queueID);
}
@@ -639,8 +639,8 @@ public class FetcherReducer extends
}
if (ignoreExternalLinks) {
- String toHost = new URL(newUrl).getHost().toLowerCase();
- String fromHost = new URL(url).getHost().toLowerCase();
+ String toHost = new URL(newUrl).getHost().toLowerCase(Locale.ROOT);
+ String fromHost = new URL(url).getHost().toLowerCase(Locale.ROOT);
if (toHost == null || !toHost.equals(fromHost)) {
// external links
return;
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java b/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
index 933f546..3043543 100644
--- a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
+++ b/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
@@ -27,6 +27,7 @@ import org.apache.nutch.util.URLUtil;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.Set;
/**
@@ -78,10 +79,10 @@ public class HostDbUpdateReducer extends
// output host data
Host host = new Host();
host.getMetadata().put(new Utf8("p"),
- ByteBuffer.wrap(Integer.toString(numPages).getBytes()));
+ ByteBuffer.wrap(Integer.toString(numPages).getBytes(StandardCharsets.UTF_8)));
if (numFetched > 0) {
host.getMetadata().put(new Utf8("f"),
- ByteBuffer.wrap(Integer.toString(numFetched).getBytes()));
+ ByteBuffer.wrap(Integer.toString(numFetched).getBytes(StandardCharsets.UTF_8)));
}
for (String inlink : inlinkCount.getKeys()) {
host.getInlinks().put(new Utf8(inlink),
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/host/HostInjectorJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/host/HostInjectorJob.java b/src/java/org/apache/nutch/host/HostInjectorJob.java
index 12cdf28..83f247c 100644
--- a/src/java/org/apache/nutch/host/HostInjectorJob.java
+++ b/src/java/org/apache/nutch/host/HostInjectorJob.java
@@ -40,6 +40,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URL;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.*;
/**
@@ -124,7 +125,7 @@ public class HostInjectorJob implements Tool {
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
host.getMetadata().put(new Utf8(keymd),
- ByteBuffer.wrap(valuemd.getBytes()));
+ ByteBuffer.wrap(valuemd.getBytes(StandardCharsets.UTF_8)));
}
String hostname;
if (url.indexOf("://") > -1) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/net/URLFilterChecker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java
index 21ffd03..ee4daf7 100644
--- a/src/java/org/apache/nutch/net/URLFilterChecker.java
+++ b/src/java/org/apache/nutch/net/URLFilterChecker.java
@@ -27,6 +27,7 @@ import org.apache.nutch.util.NutchConfiguration;
import java.io.BufferedReader;
import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
/**
* Checks one given filter or all filters.
@@ -71,7 +72,7 @@ public class URLFilterChecker {
System.out.println("Checking URLFilter " + filterName);
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
String line;
while ((line = in.readLine()) != null) {
String out = filter.filter(line);
@@ -88,7 +89,7 @@ public class URLFilterChecker {
private void checkAll() throws Exception {
System.out.println("Checking combination of all URLFilters available");
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
String line;
while ((line = in.readLine()) != null) {
URLFilters filters = new URLFilters(this.conf);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
index d8f1c6e..b1ec60f 100644
--- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
+++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -27,6 +27,7 @@ import org.apache.nutch.util.NutchConfiguration;
import java.io.BufferedReader;
import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
/**
* Checks one given normalizer or all normalizers.
@@ -66,7 +67,7 @@ public class URLNormalizerChecker {
System.out.println("Checking URLNormalizer " + normalizerName);
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
String line;
while ((line = in.readLine()) != null) {
String out = normalizer.normalize(line, scope);
@@ -77,7 +78,7 @@ public class URLNormalizerChecker {
private void checkAll(String scope) throws Exception {
System.out.println("Checking combination of all URLNormalizers available");
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
String line;
URLNormalizers normalizers = new URLNormalizers(conf, scope);
while ((line = in.readLine()) != null) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/parse/ParseUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParseUtil.java b/src/java/org/apache/nutch/parse/ParseUtil.java
index 8a37063..a38fb0a 100644
--- a/src/java/org/apache/nutch/parse/ParseUtil.java
+++ b/src/java/org/apache/nutch/parse/ParseUtil.java
@@ -45,6 +45,8 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
@@ -243,7 +245,7 @@ public class ParseUtil extends Configured {
for (Map.Entry<String, String[]> metadata : metaDatas) {
System.out.println();
newRow.getMetadata().put(new Utf8(metadata.getKey()),
- ByteBuffer.wrap(metadata.getValue()[0].getBytes()));
+ ByteBuffer.wrap(metadata.getValue()[0].getBytes(StandardCharsets.UTF_8)));
}
int changeFrequency = calculateFetchInterval(
@@ -362,7 +364,7 @@ public class ParseUtil extends Configured {
String fromHost;
if (ignoreExternalLinks) {
try {
- fromHost = new URL(url).getHost().toLowerCase();
+ fromHost = new URL(url).getHost().toLowerCase(Locale.ROOT);
} catch (final MalformedURLException e) {
fromHost = null;
}
@@ -382,7 +384,7 @@ public class ParseUtil extends Configured {
String toHost;
if (ignoreExternalLinks) {
try {
- toHost = new URL(toUrl).getHost().toLowerCase();
+ toHost = new URL(toUrl).getHost().toLowerCase(Locale.ROOT);
} catch (final MalformedURLException e) {
toHost = null;
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/parse/ParserJob.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/parse/ParserJob.java b/src/java/org/apache/nutch/parse/ParserJob.java
index a021879..9762a00 100644
--- a/src/java/org/apache/nutch/parse/ParserJob.java
+++ b/src/java/org/apache/nutch/parse/ParserJob.java
@@ -21,6 +21,7 @@ import java.nio.ByteBuffer;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Map;
import org.apache.avro.util.Utf8;
@@ -304,7 +305,7 @@ public class ParserJob extends NutchTool implements Tool {
boolean sitemap)
throws Exception {
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
long start = System.currentTimeMillis();
LOG.info("ParserJob: starting at {}", sdf.format(start));
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/protocol/Content.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/Content.java b/src/java/org/apache/nutch/protocol/Content.java
index f4c4098..77f9b51 100755
--- a/src/java/org/apache/nutch/protocol/Content.java
+++ b/src/java/org/apache/nutch/protocol/Content.java
@@ -23,6 +23,7 @@ import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.zip.InflaterInputStream;
@@ -265,7 +266,7 @@ public final class Content implements Writable {
buffer.append("contentType: " + contentType + "\n");
buffer.append("metadata: " + metadata + "\n");
buffer.append("Content:\n");
- buffer.append(new String(content)); // try default encoding
+ buffer.append(new String(content, StandardCharsets.UTF_8)); // decode content as UTF-8
return buffer.toString();
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 16e380d..867b71b 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -19,9 +19,11 @@ package org.apache.nutch.protocol;
// JDK imports
import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.Hashtable;
import java.util.StringTokenizer;
@@ -172,7 +174,7 @@ public abstract class RobotRulesParser implements Configurable {
BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
"text/plain", argv[2]);
- LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
+ LineNumberReader testsIn = new LineNumberReader(new InputStreamReader(new FileInputStream(argv[1]), StandardCharsets.UTF_8));
String testPath = testsIn.readLine().trim();
while (testPath != null) {
System.out.println((rules.isAllowed(testPath) ? "allowed"
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/Benchmark.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/Benchmark.java b/src/java/org/apache/nutch/tools/Benchmark.java
index 6643ba3..68c1755 100644
--- a/src/java/org/apache/nutch/tools/Benchmark.java
+++ b/src/java/org/apache/nutch/tools/Benchmark.java
@@ -17,6 +17,7 @@
package org.apache.nutch.tools;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -55,7 +56,7 @@ public class Benchmark extends Configured implements Tool {
OutputStream os = fs.create(new Path(seedsDir, "seeds"));
for (int i = 0; i < count; i++) {
String url = "http://www.test-" + i + ".com/\r\n";
- os.write(url.getBytes());
+ os.write(url.getBytes(StandardCharsets.UTF_8));
}
os.flush();
os.close();
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/DmozParser.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/DmozParser.java b/src/java/org/apache/nutch/tools/DmozParser.java
index ae63505..03d2662 100644
--- a/src/java/org/apache/nutch/tools/DmozParser.java
+++ b/src/java/org/apache/nutch/tools/DmozParser.java
@@ -19,6 +19,7 @@ package org.apache.nutch.tools;
import java.io.*;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.*;
@@ -195,12 +196,12 @@ public class DmozParser {
if (row != null) {
if (desc.length() > 0) {
row.getMetadata().put(new Utf8("_dmoz_desc_"),
- ByteBuffer.wrap(desc.toString().getBytes()));
+ ByteBuffer.wrap(desc.toString().getBytes(StandardCharsets.UTF_8)));
desc.delete(0, desc.length());
}
if (title.length() > 0) {
row.getMetadata().put(new Utf8("_dmoz_title_"),
- ByteBuffer.wrap(title.toString().getBytes()));
+ ByteBuffer.wrap(title.toString().getBytes(StandardCharsets.UTF_8)));
title.delete(0, title.length());
}
store.put(reversedUrl, row);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/ResolveUrls.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/ResolveUrls.java b/src/java/org/apache/nutch/tools/ResolveUrls.java
index fe8c24f..8c8bf97 100644
--- a/src/java/org/apache/nutch/tools/ResolveUrls.java
+++ b/src/java/org/apache/nutch/tools/ResolveUrls.java
@@ -17,9 +17,10 @@
package org.apache.nutch.tools;
import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
import java.net.InetAddress;
+import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
@@ -102,8 +103,7 @@ public class ResolveUrls {
pool = Executors.newFixedThreadPool(numThreads);
// read in the urls file and loop through each line, one url per line
- BufferedReader buffRead = new BufferedReader(new FileReader(new File(
- urlsFile)));
+ BufferedReader buffRead = new BufferedReader(new InputStreamReader(new FileInputStream(urlsFile), StandardCharsets.UTF_8));
String urlStr = null;
while ((urlStr = buffRead.readLine()) != null) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
index 2b6a3f9..d3f9799 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
@@ -18,6 +18,7 @@ package org.apache.nutch.tools.arc;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import org.slf4j.Logger;
@@ -269,7 +270,7 @@ public class ArcRecordReader implements RecordReader<Text, BytesWritable> {
}
// create the header and the raw content minus the header
- String header = new String(content, 0, eol).trim();
+ String header = new String(content, 0, eol, StandardCharsets.UTF_8).trim();
byte[] raw = new byte[(content.length - eol) - 1];
System.arraycopy(content, eol + 1, raw, 0, raw.length);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java b/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
index fce2d3b..699cfa3 100644
--- a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
+++ b/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
@@ -35,6 +35,7 @@ package org.apache.nutch.tools.proxy;
import java.io.IOException;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.util.Random;
import java.util.concurrent.atomic.AtomicLong;
@@ -118,7 +119,7 @@ public class FakeHandler extends AbstractTestbedHandler {
os.write(bytes);
// record URI
String p = "<p>URI: " + uri + "</p>\r\n";
- os.write(p.getBytes());
+ os.write(p.getBytes(StandardCharsets.UTF_8));
// fake some links
String basePath;
String baseDomain;
@@ -142,7 +143,7 @@ public class FakeHandler extends AbstractTestbedHandler {
link += pageSeq.getAndIncrement() + ".html'>";
}
link += "outlink " + i + "</a></p>\r\n";
- os.write(link.getBytes());
+ os.write(link.getBytes(StandardCharsets.UTF_8));
}
baseDomain = u.getHost();
// chop off the TLD
@@ -160,15 +161,15 @@ public class FakeHandler extends AbstractTestbedHandler {
link = "http://" + host + "/";
}
link = "<p><a href='" + link + "'>fake host " + host + "</a></p>\r\n";
- os.write(link.getBytes());
+ os.write(link.getBytes(StandardCharsets.UTF_8));
}
// fake a link to the root URL
link = "<p><a href='" + u.getScheme() + "://" + u.getHost();
if (u.getPort() != 80 && u.getPort() != -1)
link += ":" + u.getPort();
link += "/'>site " + u.getHost() + "</a></p>\r\n";
- os.write(link.getBytes());
- os.write(testB.getBytes());
+ os.write(link.getBytes(StandardCharsets.UTF_8));
+ os.write(testB.getBytes(StandardCharsets.UTF_8));
res.flushBuffer();
} catch (IOException ioe) {
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/Bytes.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java
index 87323a6..db9f468 100644
--- a/src/java/org/apache/nutch/util/Bytes.java
+++ b/src/java/org/apache/nutch/util/Bytes.java
@@ -28,6 +28,7 @@ import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.Iterator;
+import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -396,7 +397,7 @@ public class Bytes {
|| " `~!@#$%^&*()-_=+[]{}\\|;:'\",.<>/?".indexOf(ch) >= 0) {
result.append(first.charAt(i));
} else {
- result.append(String.format("\\x%02X", ch));
+ result.append(String.format(Locale.ROOT, "\\x%02X", ch));
}
}
} catch (UnsupportedEncodingException e) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/EncodingDetector.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java
index ff6cf00..5b40e29 100644
--- a/src/java/org/apache/nutch/util/EncodingDetector.java
+++ b/src/java/org/apache/nutch/util/EncodingDetector.java
@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
/**
* A simple class for detecting character encodings.
@@ -72,7 +73,7 @@ public class EncodingDetector {
}
public EncodingClue(String value, String source, int confidence) {
- this.value = value.toLowerCase();
+ this.value = value.toLowerCase(Locale.ROOT);
this.source = source;
this.confidence = confidence;
}
@@ -269,7 +270,7 @@ public class EncodingDetector {
LOG.trace(baseUrl + ": Choosing encoding: " + charset
+ " with confidence " + clue.confidence);
}
- return resolveEncodingAlias(charset).toLowerCase();
+ return resolveEncodingAlias(charset).toLowerCase(Locale.ROOT);
} else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
bestClue = clue;
}
@@ -278,7 +279,7 @@ public class EncodingDetector {
if (LOG.isTraceEnabled()) {
LOG.trace(baseUrl + ": Choosing encoding: " + bestClue);
}
- return bestClue.value.toLowerCase();
+ return bestClue.value.toLowerCase(Locale.ROOT);
}
/** Clears all clues. */
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/TimingUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 8f77969..524bee6 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -18,6 +18,7 @@
package org.apache.nutch.util;
import java.text.NumberFormat;
+import java.util.Locale;
public class TimingUtil {
@@ -45,7 +46,7 @@ public class TimingUtil {
start += TIME_FACTOR[i] * elapsedTime[i];
}
- NumberFormat nf = NumberFormat.getInstance();
+ NumberFormat nf = NumberFormat.getInstance(Locale.ROOT);
nf.setMinimumIntegerDigits(2);
StringBuffer buf = new StringBuffer();
for (int i = 0; i < elapsedTime.length; i++) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/URLUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index df16423..5183ba1 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -19,6 +19,7 @@ package org.apache.nutch.util;
import java.net.MalformedURLException;
import java.net.*;
+import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.nutch.util.domain.DomainSuffix;
@@ -386,7 +387,7 @@ public class URLUtil {
*/
public static String getHost(String url) {
try {
- return new URL(url).getHost().toLowerCase();
+ return new URL(url).getHost().toLowerCase(Locale.ROOT);
} catch (MalformedURLException e) {
return null;
}
@@ -404,7 +405,7 @@ public class URLUtil {
public static String getPage(String url) {
try {
// get the full url, and replace the query string with and empty string
- url = url.toLowerCase();
+ url = url.toLowerCase(Locale.ROOT);
String queryStr = new URL(url).getQuery();
return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
} catch (MalformedURLException e) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index 57eb81e..7313a03 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -20,6 +20,7 @@ package org.apache.nutch.util.domain;
import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
+import java.util.Locale;
import org.apache.gora.mapreduce.GoraMapper;
import org.apache.gora.query.Query;
@@ -97,7 +98,7 @@ public class DomainStatistics extends Configured implements Tool {
}
}
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+ SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT);
long start = System.currentTimeMillis();
LOG.info("DomainStatistics: starting at " + sdf.format(start));
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
index ea19a8a..107771a 100644
--- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
+++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
@@ -18,6 +18,7 @@ package org.apache.nutch.webui.client.impl;
import java.io.Serializable;
import java.text.MessageFormat;
+import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
import org.apache.nutch.webui.client.model.JobConfig;
@@ -68,9 +69,9 @@ public class RemoteCommand implements Serializable {
public String toString() {
String statusInfo = StringUtils.EMPTY;
if (jobInfo != null) {
- statusInfo = MessageFormat.format("{0}", jobInfo.getState());
+ // Instance format(Object) takes the argument array itself; a bare value would throw ClassCastException.
+ statusInfo = new MessageFormat("{0}", Locale.ROOT).format(new Object[] {jobInfo.getState()});
}
- return MessageFormat.format("{0} status: {1}", jobConfig.getType(),
- statusInfo);
+ return new MessageFormat("{0} status: {1}", Locale.ROOT).format(new Object[] {jobConfig.getType(), statusInfo});
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
index f8db384..be427ef 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
@@ -37,6 +37,7 @@ import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
+import java.nio.charset.StandardCharsets;
/** Adds metadata identifying the Creative Commons license used, if any. */
public class CCParseFilter implements ParseFilter {
@@ -87,9 +88,9 @@ public class CCParseFilter implements ParseFilter {
+ " of " + base);
}
page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL),
- ByteBuffer.wrap(licenseUrl.getBytes()));
+ ByteBuffer.wrap(licenseUrl.getBytes(StandardCharsets.UTF_8)));
page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION),
- ByteBuffer.wrap(licenseLocation.getBytes()));
+ ByteBuffer.wrap(licenseLocation.getBytes(StandardCharsets.UTF_8)));
}
if (walker.workType != null) {
@@ -97,7 +98,7 @@ public class CCParseFilter implements ParseFilter {
LOG.debug("CC: found " + walker.workType + " in " + base);
}
page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE),
- ByteBuffer.wrap(walker.workType.getBytes()));
+ ByteBuffer.wrap(walker.workType.getBytes(StandardCharsets.UTF_8)));
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
index 25149be..9e2e75b 100644
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
+++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
@@ -30,6 +30,7 @@ import java.lang.CharSequence;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map.Entry;
+import java.util.Locale;
/**
* Indexing filter that offers an option to either index all inbound anchor text
@@ -97,7 +98,7 @@ public class AnchorIndexingFilter implements IndexingFilter {
if (deduplicate) {
if (set == null)
set = new HashSet<String>();
- String lcAnchor = anchor.toLowerCase();
+ String lcAnchor = anchor.toLowerCase(Locale.ROOT);
// Check if already processed the current anchor
if (!set.contains(lcAnchor)) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
index 6db3bea..eb1454b 100644
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
+++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
@@ -18,6 +18,7 @@ package org.apache.nutch.indexer.html;
import java.io.ByteArrayInputStream;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
import java.util.Scanner;
@@ -67,7 +68,7 @@ public class HtmlIndexingFilter implements IndexingFilter {
LOG.info("Html indexing for: " + url.toString());
}
ByteArrayInputStream arrayInputStream = new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining());
- Scanner scanner = new Scanner(arrayInputStream);
+ Scanner scanner = new Scanner(arrayInputStream, StandardCharsets.UTF_8.name());
scanner.useDelimiter("\\Z");//To read all scanner content in one String
String data = "";
if (scanner.hasNext()) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
index 2e9da51..206831d 100644
--- a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
+++ b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
@@ -26,6 +26,7 @@ import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import static org.junit.Assert.*;
@@ -81,7 +82,7 @@ public class TestMoreIndexingFilter {
filter.setConf(conf);
WebPage page = WebPage.newBuilder().build();
String url = "http://www.example.com/";
- page.setContent(ByteBuffer.wrap("text".getBytes()));
+ page.setContent(ByteBuffer.wrap("text".getBytes(StandardCharsets.UTF_8)));
page.setTitle(new Utf8("title"));
page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source));
NutchDocument doc = filter.filter(new NutchDocument(), url, page);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
index ee0560b..f3af6a9 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -39,6 +39,7 @@ import org.w3c.dom.Node;
import java.lang.CharSequence;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.*;
/**
@@ -67,7 +68,7 @@ public class HTMLLanguageParser implements ParseFilter {
String[] values = p.getProperty(key).split(",", -1);
LANGUAGES_MAP.put(key, key);
for (int i = 0; i < values.length; i++) {
- LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+ LANGUAGES_MAP.put(values[i].trim().toLowerCase(Locale.ROOT), key);
}
}
} catch (Exception e) {
@@ -115,7 +116,7 @@ public class HTMLLanguageParser implements ParseFilter {
if (lang != null) {
page.getMetadata().put(new Utf8(Metadata.LANGUAGE),
- ByteBuffer.wrap(lang.getBytes()));
+ ByteBuffer.wrap(lang.getBytes(StandardCharsets.UTF_8)));
return parse;
}
@@ -255,7 +256,7 @@ public class HTMLLanguageParser implements ParseFilter {
Node attrnode = attrs.item(i);
if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
if ("content-language".equals(attrnode.getNodeValue()
- .toLowerCase())) {
+ .toLowerCase(Locale.ROOT))) {
Node valueattr = attrs.getNamedItem("content");
if (valueattr != null) {
httpEquiv = parseLanguage(valueattr.getNodeValue());
@@ -296,7 +297,7 @@ public class HTMLLanguageParser implements ParseFilter {
code = langs[i].split("-")[0];
code = code.split("_")[0];
// Find the ISO 639 code
- language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+ language = (String) LANGUAGES_MAP.get(code.toLowerCase(Locale.ROOT));
i++;
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
index c98c00f..1432999 100644
--- a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
@@ -31,6 +31,7 @@ import org.junit.Test;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
@@ -107,7 +108,7 @@ public class TestHTMLLanguageParser {
long total = 0;
LanguageIdentifier identifier;
BufferedReader in = new BufferedReader(new InputStreamReader(this
- .getClass().getResourceAsStream("test-referencial.txt")));
+ .getClass().getResourceAsStream("test-referencial.txt"), StandardCharsets.UTF_8));
String line = null;
while ((line = in.readLine()) != null) {
String[] tokens = line.split(";");
@@ -149,7 +150,7 @@ public class TestHTMLLanguageParser {
private WebPage getPage(String text) {
WebPage page = WebPage.newBuilder().build();
page.setBaseUrl(BASE);
- page.setContent(ByteBuffer.wrap(text.getBytes()));
+ page.setContent(ByteBuffer.wrap(text.getBytes(StandardCharsets.UTF_8)));
page.setContentType(new Utf8("text/html"));
page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8,
new Utf8("text/html"));
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index d0a4726..0a6121b 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
@@ -516,7 +517,7 @@ public abstract class HttpBase implements Protocol {
System.out.println("Content Length: "
+ content.getMetadata().get(Response.CONTENT_LENGTH));
System.out.println("Content:");
- String text = new String(content.getContent());
+ String text = new String(content.getContent(), StandardCharsets.UTF_8);
System.out.println(text);
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 1d6ea55..bd64d76 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -28,6 +28,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
+import java.util.Locale;
/**
* This class is used for parsing robots for urls belonging to HTTP protocol. It
@@ -50,9 +51,9 @@ public class HttpRobotRulesParser extends RobotRulesParser {
/** Compose unique key to store and access robot rules in cache for given URL */
protected static String getCacheKey(URL url) {
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+ String protocol = url.getProtocol().toLowerCase(Locale.ROOT); // normalize to lower
// case
- String host = url.getHost().toLowerCase(); // normalize to lower case
+ String host = url.getHost().toLowerCase(Locale.ROOT); // normalize to lower case
int port = url.getPort();
if (port == -1) {
port = url.getDefaultPort();
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 47b41a3..8d033e9 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -17,6 +17,8 @@
package org.apache.nutch.protocol.http.api;
+import java.nio.charset.StandardCharsets;
+
import org.junit.Before;
import org.junit.Test;
@@ -80,7 +82,7 @@ public class TestRobotRulesParser {
*/
@Test
public void testRobotsAgent() {
- rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
CONTENT_TYPE, SINGLE_AGENT);
for (int counter = 0; counter < TEST_PATHS.length; counter++) {
@@ -91,7 +93,7 @@ public class TestRobotRulesParser {
rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);
}
- rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
+ rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
CONTENT_TYPE, MULTIPLE_AGENTS);
for (int counter = 0; counter < TEST_PATHS.length; counter++) {
@@ -112,13 +114,13 @@ public class TestRobotRulesParser {
public void testCrawlDelay() {
// for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be
// returned by the parser
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
CONTENT_TYPE, SINGLE_AGENT);
assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ",
(rules.getCrawlDelay() == 10000));
// for UNKNOWN_AGENT, the default crawl delay must be returned.
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
+ rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8),
CONTENT_TYPE, UNKNOWN_AGENT);
assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ",
(rules.getCrawlDelay() == Long.MIN_VALUE));
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index 40ba266..d374e95 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -22,8 +22,10 @@ import java.io.Reader;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.InputStreamReader;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.ArrayList;
@@ -82,7 +84,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
*/
public RegexURLFilterBase(File filename) throws IOException,
IllegalArgumentException {
- this(new FileReader(filename));
+ this(new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8));
}
/**
@@ -245,7 +247,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
public static void main(RegexURLFilterBase filter, String args[])
throws IOException, IllegalArgumentException {
- BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
String line;
while ((line = in.readLine()) != null) {
String out = filter.filter(line);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
index 2b40b48..ae4660f 100644
--- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -21,6 +21,10 @@ import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.FileInputStream;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -57,8 +61,8 @@ public abstract class RegexURLFilterBaseTest {
protected void bench(int loops, String file) {
try {
- bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ bench(loops, new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".rules"), StandardCharsets.UTF_8),
+ new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".urls"), StandardCharsets.UTF_8));
} catch (Exception e) {
fail(e.toString());
}
@@ -81,8 +85,8 @@ public abstract class RegexURLFilterBaseTest {
protected void test(String file) {
try {
- test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
- new FileReader(SAMPLES + SEPARATOR + file + ".urls"));
+ test(new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".rules"), StandardCharsets.UTF_8),
+ new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".urls"), StandardCharsets.UTF_8));
} catch (Exception e) {
fail(e.toString());
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
index f71c5ab..00fa30d 100644
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
+++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -20,6 +20,7 @@ package org.apache.nutch.microformats.reltag;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
@@ -171,7 +172,7 @@ public class RelTagParser implements ParseFilter {
sb.append(iter.next());
sb.append("\t");
}
- ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
+ ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes(StandardCharsets.UTF_8));
page.getMetadata().put(new Utf8(REL_TAG), bb);
return parse;
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java b/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
index 064b46b..66964de 100644
--- a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
+++ b/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
@@ -34,6 +34,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import static org.junit.Assert.assertEquals;
@@ -90,7 +91,7 @@ public class TestRelTagParser {
ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag"));
byte[] byteArray = new byte[bbuf.remaining()];
bbuf.get(byteArray);
- String s = new String(byteArray);
+ String s = new String(byteArray, StandardCharsets.UTF_8);
// bbuf.flip();
assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter",
expectedRelTags, s);
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 3ba3716..8e079fb 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -22,6 +22,7 @@ import java.net.MalformedURLException;
import java.util.Collection;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.Locale;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
@@ -320,7 +321,7 @@ public class DOMContentUtils {
if (nodeType == Node.ELEMENT_NODE) {
- nodeName = nodeName.toLowerCase();
+ nodeName = nodeName.toLowerCase(Locale.ROOT);
LinkParams params = linkParams.get(nodeName);
if (params != null) {
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
index 159aa76..3e066c4 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -18,6 +18,7 @@
package org.apache.nutch.parse.html;
import java.net.URL;
+import java.util.Locale;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;
@@ -64,7 +65,7 @@ public class HTMLMetaProcessor {
// Retrieves name, http-equiv and content attribues
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
- String attrName = attr.getNodeName().toLowerCase();
+ String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
if (attrName.equals("name")) {
nameNode = attr;
} else if (attrName.equals("http-equiv")) {
@@ -76,12 +77,12 @@ public class HTMLMetaProcessor {
if (nameNode != null) {
if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
+ String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {
- String directives = contentNode.getNodeValue().toLowerCase();
+ String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT);
int index = directives.indexOf("none");
if (index >= 0) {
@@ -116,11 +117,11 @@ public class HTMLMetaProcessor {
if (equivNode != null) {
if (contentNode != null) {
- String name = equivNode.getNodeValue().toLowerCase();
+ String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
String content = contentNode.getNodeValue();
metaTags.getHttpEquivTags().setProperty(name, content);
if ("pragma".equals(name)) {
- content = content.toLowerCase();
+ content = content.toLowerCase(Locale.ROOT);
int index = content.indexOf("no-cache");
if (index >= 0)
metaTags.setNoCache();
@@ -140,7 +141,7 @@ public class HTMLMetaProcessor {
}
URL refreshUrl = null;
if (metaTags.getRefresh() && idx != -1) { // set the URL
- idx = content.toLowerCase().indexOf("url=");
+ idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
if (idx == -1) { // assume a mis-formatted entry with just the
// url
idx = content.indexOf(';') + 1;
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 5440ec7..3255dcc 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
@@ -182,7 +183,7 @@ public class TestDOMContentUtils {
DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
try {
parser.parse(
- new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
+ new InputSource(new ByteArrayInputStream(testPages[i].getBytes(StandardCharsets.UTF_8))),
node);
testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
} catch (Exception e) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
index 8c58ca4..f390041 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
@@ -20,6 +20,7 @@ package org.apache.nutch.parse.html;
import org.apache.nutch.parse.HTMLMetaTags;
import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
import java.net.URL;
import org.cyberneko.html.parsers.*;
@@ -123,7 +124,7 @@ public class TestRobotsMetaProcessor {
}
for (int i = 0; i < tests.length; i++) {
- byte[] bytes = tests[i].getBytes();
+ byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
index fc2e930..a481755 100644
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
+import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -150,7 +151,7 @@ public class JSParseFilter implements ParseFilter, Parser {
links = getJSLinks(anode.getNodeValue(), "", base);
} else if (anode.getNodeName().equalsIgnoreCase("href")) {
String val = anode.getNodeValue();
- if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
+ if (val != null && val.toLowerCase(Locale.ROOT).indexOf("javascript:") != -1) {
links = getJSLinks(val, "", base);
}
}
@@ -178,7 +179,7 @@ public class JSParseFilter implements ParseFilter, Parser {
public Parse getParse(String url, WebPage page) {
String type = TableUtil.toString(page.getContentType());
if (type != null && !type.trim().equals("")
- && !type.toLowerCase().startsWith("application/x-javascript"))
+ && !type.toLowerCase(Locale.ROOT).startsWith("application/x-javascript"))
return ParseStatusUtils.getEmptyParse(
ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '"
+ type + "'", getConf());
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 2aac3c6..f61838c 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -27,6 +27,7 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Map.Entry;
+import java.nio.charset.StandardCharsets;
import org.apache.avro.util.Utf8;
import org.apache.commons.logging.Log;
@@ -83,7 +84,7 @@ public class MetaTagsParser implements ParseFilter {
LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
}
metadata.put(new Utf8(PARSE_META_PREFIX + lcMetatag),
- ByteBuffer.wrap(value.getBytes()));
+ ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8)));
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
index 1b42263..a13eac7 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
@@ -48,6 +48,7 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.Map;
+import java.util.Locale;
public class TestMetaTagsParser {
@@ -129,7 +130,7 @@ public class TestMetaTagsParser {
// Retrieves name, http-equiv and content attribues
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
- String attrName = attr.getNodeName().toLowerCase();
+ String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
if (attrName.equals("name")) {
nameNode = attr;
} else if (attrName.equals("http-equiv")) {
@@ -140,14 +141,14 @@ public class TestMetaTagsParser {
}
if (nameNode != null) {
if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
+ String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
}
}
if (equivNode != null) {
if (contentNode != null) {
- String name = equivNode.getNodeValue().toLowerCase();
+ String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
String content = contentNode.getNodeValue();
metaTags.getHttpEquivTags().setProperty(name, content);
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index ae1cb44..ee95862 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -22,6 +22,8 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
+import java.util.Locale;
+import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
@@ -321,7 +323,7 @@ public class DOMContentUtils {
if (nodeType == Node.ELEMENT_NODE) {
- nodeName = nodeName.toLowerCase();
+ nodeName = nodeName.toLowerCase(Locale.ROOT);
LinkParams params = (LinkParams) linkParams.get(nodeName);
if (params != null) {
if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 294bde9..0818eff 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -18,6 +18,7 @@
package org.apache.nutch.parse.tika;
import java.net.URL;
+import java.util.Locale;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.*;
@@ -64,7 +65,7 @@ public class HTMLMetaProcessor {
// Retrieves name, http-equiv and content attribues
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
- String attrName = attr.getNodeName().toLowerCase();
+ String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
if (attrName.equals("name")) {
nameNode = attr;
} else if (attrName.equals("http-equiv")) {
@@ -76,12 +77,12 @@ public class HTMLMetaProcessor {
if (nameNode != null) {
if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
+ String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {
- String directives = contentNode.getNodeValue().toLowerCase();
+ String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT);
int index = directives.indexOf("none");
if (index >= 0) {
@@ -116,11 +117,11 @@ public class HTMLMetaProcessor {
if (equivNode != null) {
if (contentNode != null) {
- String name = equivNode.getNodeValue().toLowerCase();
+ String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
String content = contentNode.getNodeValue();
metaTags.getHttpEquivTags().setProperty(name, content);
if ("pragma".equals(name)) {
- content = content.toLowerCase();
+ content = content.toLowerCase(Locale.ROOT);
int index = content.indexOf("no-cache");
if (index >= 0)
metaTags.setNoCache();
@@ -140,7 +141,7 @@ public class HTMLMetaProcessor {
}
URL refreshUrl = null;
if (metaTags.getRefresh() && idx != -1) { // set the URL
- idx = content.toLowerCase().indexOf("url=");
+ idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
if (idx == -1) { // assume a mis-formatted entry with just the
// url
idx = content.indexOf(';') + 1;
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
index 957d664..06bea9f 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
@@ -39,6 +39,7 @@ import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
import java.net.URL;
import java.util.ArrayList;
import java.util.StringTokenizer;
@@ -210,7 +211,7 @@ public class DOMContentUtilsTest {
// to add once available in Tika
// context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
try {
- parser.parse(new ByteArrayInputStream(testPages[i].getBytes()),
+ parser.parse(new ByteArrayInputStream(testPages[i].getBytes(StandardCharsets.UTF_8)),
domhandler, tikamd, context);
testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
} catch (Exception e) {
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 3a1204c..350be0e 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -23,6 +23,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
@@ -72,14 +73,14 @@ public class TestImageMetadata {
ByteBuffer bbufW = page.getMetadata().get(new Utf8("width"));
byte[] byteArrayW = new byte[bbufW.remaining()];
bbufW.get(byteArrayW);
- String width = new String(byteArrayW);
+ String width = new String(byteArrayW, StandardCharsets.UTF_8);
assertEquals("121", width);
// assert height
ByteBuffer bbufH = page.getMetadata().get(new Utf8("height"));
byte[] byteArrayH = new byte[bbufH.remaining()];
bbufH.get(byteArrayH);
- String height = new String(byteArrayH);
+ String height = new String(byteArrayH, StandardCharsets.UTF_8);
assertEquals("48", height);
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index e7139cc..0695439 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -28,6 +28,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
@@ -218,7 +219,7 @@ public class File implements Protocol {
}
if (dumpContent) {
- System.out.print(new String(content.getContent()));
+ System.out.print(new String(content.getContent(), StandardCharsets.UTF_8));
}
file = null;
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
index 410f8e6..0e5f2b0 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -20,6 +20,7 @@ package org.apache.nutch.protocol.file;
// JDK imports
import java.io.IOException;
import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
@@ -275,7 +276,7 @@ public class FileResponse {
x.append("</pre></body></html>\n");
- return new String(x).getBytes();
+ return new String(x).getBytes(StandardCharsets.UTF_8);
}
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
index 2478b0a..ffa2091 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.net.InetAddress;
import java.net.Socket;
import java.util.List;
@@ -330,7 +331,7 @@ public class Client extends FTP {
+ ((path == null) ? "" : path));
BufferedReader reader = new BufferedReader(new InputStreamReader(
- socket.getInputStream()));
+ socket.getInputStream(), StandardCharsets.UTF_8));
// force-close data channel socket, when download limit is reached
// boolean mandatory_close = false;
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index ccfae0a..3f3a7e8 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -31,6 +31,7 @@ import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.HashSet;
+import java.nio.charset.StandardCharsets;
/**
* This class is a protocol plugin used for ftp: scheme. It creates
@@ -243,7 +244,7 @@ public class Ftp implements Protocol {
System.err.println("Last-Modified: "
+ content.getMetadata().get(Response.LAST_MODIFIED));
if (dumpContent) {
- System.out.print(new String(content.getContent()));
+ System.out.print(new String(content.getContent(), StandardCharsets.UTF_8));
}
ftp = null;