You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/05/07 20:39:51 UTC
[4/6] nutch git commit: Code style : 2 spaces instead of tabs
Code style : 2 spaces instead of tabs
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f5adbcc3
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f5adbcc3
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f5adbcc3
Branch: refs/heads/master
Commit: f5adbcc3c3bb447110b6733e0851b931e57465c3
Parents: 298cffc
Author: Thamme Gowda <tg...@gmail.com>
Authored: Sat Apr 30 17:15:50 2016 -0700
Committer: Thamme Gowda <tg...@gmail.com>
Committed: Sat Apr 30 17:15:50 2016 -0700
----------------------------------------------------------------------
.../nutch/tools/AbstractCommonCrawlFormat.java | 694 +++++++++----------
1 file changed, 347 insertions(+), 347 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/f5adbcc3/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
index d5a0154..1b425c4 100644
--- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
@@ -43,351 +43,351 @@ import com.ibm.icu.text.SimpleDateFormat;
*
*/
public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
- protected static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName());
-
- protected String url;
-
- protected Content content;
-
- protected Metadata metadata;
-
- protected Configuration conf;
-
- protected String keyPrefix;
-
- protected boolean simpleDateFormat;
-
- protected boolean jsonArray;
-
- protected boolean reverseKey;
-
- protected String reverseKeyValue;
-
- protected List<String> inLinks;
-
- public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
- this.url = url;
- this.content = content;
- this.metadata = metadata;
- this.conf = nutchConf;
-
- this.keyPrefix = config.getKeyPrefix();
- this.simpleDateFormat = config.getSimpleDateFormat();
- this.jsonArray = config.getJsonArray();
- this.reverseKey = config.getReverseKey();
- this.reverseKeyValue = config.getReverseKeyValue();
- }
-
- public String getJsonData(String url, Content content, Metadata metadata)
- throws IOException {
- this.url = url;
- this.content = content;
- this.metadata = metadata;
-
- return this.getJsonData();
- }
-
- public String getJsonData(String url, Content content, Metadata metadata,
- ParseData parseData) throws IOException {
-
- // override of this is required in the actual formats
- throw new NotImplementedException();
- }
-
- @Override
- public String getJsonData() throws IOException {
- try {
- startObject(null);
-
- // url
- writeKeyValue("url", getUrl());
-
- // timestamp
- writeKeyValue("timestamp", getTimestamp());
-
- // request
- startObject("request");
- writeKeyValue("method", getMethod());
- startObject("client");
- writeKeyValue("hostname", getRequestHostName());
- writeKeyValue("address", getRequestHostAddress());
- writeKeyValue("software", getRequestSoftware());
- writeKeyValue("robots", getRequestRobots());
- startObject("contact");
- writeKeyValue("name", getRequestContactName());
- writeKeyValue("email", getRequestContactEmail());
- closeObject("contact");
- closeObject("client");
- // start request headers
- startHeaders("headers", false, true);
- writeKeyValueWrapper("Accept", getRequestAccept());
- writeKeyValueWrapper("Accept-Encoding", getRequestAcceptEncoding());
- writeKeyValueWrapper("Accept-Language", getRequestAcceptLanguage());
- writeKeyValueWrapper("User-Agent", getRequestUserAgent());
- //closeObject("headers");
- closeHeaders("headers", false, true);
- writeKeyNull("body");
- closeObject("request");
-
- // response
- startObject("response");
- writeKeyValue("status", getResponseStatus());
- startObject("server");
- writeKeyValue("hostname", getResponseHostName());
- writeKeyValue("address", getResponseAddress());
- closeObject("server");
- // start response headers
- startHeaders("headers", false, true);
- writeKeyValueWrapper("Content-Encoding", getResponseContentEncoding());
- writeKeyValueWrapper("Content-Type", getResponseContentType());
- writeKeyValueWrapper("Date", getResponseDate());
- writeKeyValueWrapper("Server", getResponseServer());
- for (String name : metadata.names()) {
- if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) {
- continue;
- }
- writeKeyValueWrapper(name, metadata.get(name));
- }
- closeHeaders("headers", false, true);
- writeKeyValue("body", getResponseContent());
- closeObject("response");
-
- // key
- if (!this.keyPrefix.isEmpty()) {
- this.keyPrefix += "-";
- }
- writeKeyValue("key", this.keyPrefix + getKey());
-
- // imported
- writeKeyValue("imported", getImported());
-
- if (getInLinks() != null){
- startArray("inlinks", false, true);
- for (String link : getInLinks()) {
- writeArrayValue(link);
- }
- closeArray("inlinks", false, true);
- }
- closeObject(null);
-
- return generateJson();
-
- } catch (IOException ioe) {
- LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
- throw new IOException("Error in generating JSON:" + ioe.getMessage());
- }
- }
-
- // abstract methods
-
- protected abstract void writeKeyValue(String key, String value) throws IOException;
-
- protected abstract void writeKeyNull(String key) throws IOException;
-
- protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException;
-
- protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException;
-
- protected abstract void writeArrayValue(String value) throws IOException;
-
- protected abstract void startObject(String key) throws IOException;
-
- protected abstract void closeObject(String key) throws IOException;
-
- protected abstract String generateJson() throws IOException;
-
- // getters
-
- protected String getUrl() {
- try {
- return URIUtil.encodePath(url);
- } catch (URIException e) {
- LOG.error("Can't encode URL " + url);
- }
-
- return url;
- }
-
- protected String getTimestamp() {
- if (this.simpleDateFormat) {
- String timestamp = null;
- try {
- long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime();
- timestamp = String.valueOf(epoch);
- } catch (ParseException pe) {
- LOG.warn(pe.getMessage());
- }
- return timestamp;
- } else {
- return ifNullString(metadata.get(Metadata.LAST_MODIFIED));
- }
- }
-
- protected String getMethod() {
- return new String("GET");
- }
-
- protected String getRequestHostName() {
- String hostName = "";
- try {
- hostName = InetAddress.getLocalHost().getHostName();
- } catch (UnknownHostException uhe) {
-
- }
- return hostName;
- }
-
- protected String getRequestHostAddress() {
- String hostAddress = "";
- try {
- hostAddress = InetAddress.getLocalHost().getHostAddress();
- } catch (UnknownHostException uhe) {
-
- }
- return hostAddress;
- }
-
- protected String getRequestSoftware() {
- return conf.get("http.agent.version", "");
- }
-
- protected String getRequestRobots() {
- return new String("CLASSIC");
- }
-
- protected String getRequestContactName() {
- return conf.get("http.agent.name", "");
- }
-
- protected String getRequestContactEmail() {
- return conf.get("http.agent.email", "");
- }
-
- protected String getRequestAccept() {
- return conf.get("http.accept", "");
- }
-
- protected String getRequestAcceptEncoding() {
- return new String(""); // TODO
- }
-
- protected String getRequestAcceptLanguage() {
- return conf.get("http.accept.language", "");
- }
-
- protected String getRequestUserAgent() {
- return conf.get("http.robots.agents", "");
- }
-
- protected String getResponseStatus() {
- return ifNullString(metadata.get("status"));
- }
-
- protected String getResponseHostName() {
- return URLUtil.getHost(url);
- }
-
- protected String getResponseAddress() {
- return ifNullString(metadata.get("_ip_"));
- }
-
- protected String getResponseContentEncoding() {
- return ifNullString(metadata.get("Content-Encoding"));
- }
-
- protected String getResponseContentType() {
- return ifNullString(metadata.get("Content-Type"));
- }
-
- public List<String> getInLinks() {
- return inLinks;
- }
-
- public void setInLinks(List<String> inLinks) {
- this.inLinks = inLinks;
- }
-
- protected String getResponseDate() {
- if (this.simpleDateFormat) {
- String timestamp = null;
- try {
- long epoch = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
- timestamp = String.valueOf(epoch);
- } catch (ParseException pe) {
- LOG.warn(pe.getMessage());
- }
- return timestamp;
- } else {
- return ifNullString(metadata.get("Date"));
- }
- }
-
- protected String getResponseServer() {
- return ifNullString(metadata.get("Server"));
- }
-
- protected String getResponseContent() {
- return new String(content.getContent());
- }
-
- protected String getKey() {
- if (this.reverseKey) {
- return this.reverseKeyValue;
- }
- else {
- return url;
- }
- }
-
- protected String getImported() {
- if (this.simpleDateFormat) {
- String timestamp = null;
- try {
- long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
- timestamp = String.valueOf(epoch);
- } catch (ParseException pe) {
- LOG.warn(pe.getMessage());
- }
- return timestamp;
- } else {
- return ifNullString(metadata.get("Date"));
- }
- }
-
- private static String ifNullString(String value) {
- return (value != null) ? value : "";
- }
-
- private void startHeaders(String key, boolean nested, boolean newline) throws IOException {
- if (this.jsonArray) {
- startArray(key, nested, newline);
- }
- else {
- startObject(key);
- }
- }
-
- private void closeHeaders(String key, boolean nested, boolean newline) throws IOException {
- if (this.jsonArray) {
- closeArray(key, nested, newline);
- }
- else {
- closeObject(key);
- }
- }
-
- private void writeKeyValueWrapper(String key, String value) throws IOException {
- if (this.jsonArray) {
- startArray(null, true, false);
- writeArrayValue(key);
- writeArrayValue(value);
- closeArray(null, true, false);
- }
- else {
- writeKeyValue(key, value);
- }
- }
-
- @Override
- public void close() {}
+ protected static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName());
+
+ protected String url;
+
+ protected Content content;
+
+ protected Metadata metadata;
+
+ protected Configuration conf;
+
+ protected String keyPrefix;
+
+ protected boolean simpleDateFormat;
+
+ protected boolean jsonArray;
+
+ protected boolean reverseKey;
+
+ protected String reverseKeyValue;
+
+ protected List<String> inLinks;
+
+ public AbstractCommonCrawlFormat(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
+ this.url = url;
+ this.content = content;
+ this.metadata = metadata;
+ this.conf = nutchConf;
+
+ this.keyPrefix = config.getKeyPrefix();
+ this.simpleDateFormat = config.getSimpleDateFormat();
+ this.jsonArray = config.getJsonArray();
+ this.reverseKey = config.getReverseKey();
+ this.reverseKeyValue = config.getReverseKeyValue();
+ }
+
+ public String getJsonData(String url, Content content, Metadata metadata)
+ throws IOException {
+ this.url = url;
+ this.content = content;
+ this.metadata = metadata;
+
+ return this.getJsonData();
+ }
+
+ public String getJsonData(String url, Content content, Metadata metadata,
+ ParseData parseData) throws IOException {
+
+ // override of this is required in the actual formats
+ throw new NotImplementedException();
+ }
+
+ @Override
+ public String getJsonData() throws IOException {
+ try {
+ startObject(null);
+
+ // url
+ writeKeyValue("url", getUrl());
+
+ // timestamp
+ writeKeyValue("timestamp", getTimestamp());
+
+ // request
+ startObject("request");
+ writeKeyValue("method", getMethod());
+ startObject("client");
+ writeKeyValue("hostname", getRequestHostName());
+ writeKeyValue("address", getRequestHostAddress());
+ writeKeyValue("software", getRequestSoftware());
+ writeKeyValue("robots", getRequestRobots());
+ startObject("contact");
+ writeKeyValue("name", getRequestContactName());
+ writeKeyValue("email", getRequestContactEmail());
+ closeObject("contact");
+ closeObject("client");
+ // start request headers
+ startHeaders("headers", false, true);
+ writeKeyValueWrapper("Accept", getRequestAccept());
+ writeKeyValueWrapper("Accept-Encoding", getRequestAcceptEncoding());
+ writeKeyValueWrapper("Accept-Language", getRequestAcceptLanguage());
+ writeKeyValueWrapper("User-Agent", getRequestUserAgent());
+ //closeObject("headers");
+ closeHeaders("headers", false, true);
+ writeKeyNull("body");
+ closeObject("request");
+
+ // response
+ startObject("response");
+ writeKeyValue("status", getResponseStatus());
+ startObject("server");
+ writeKeyValue("hostname", getResponseHostName());
+ writeKeyValue("address", getResponseAddress());
+ closeObject("server");
+ // start response headers
+ startHeaders("headers", false, true);
+ writeKeyValueWrapper("Content-Encoding", getResponseContentEncoding());
+ writeKeyValueWrapper("Content-Type", getResponseContentType());
+ writeKeyValueWrapper("Date", getResponseDate());
+ writeKeyValueWrapper("Server", getResponseServer());
+ for (String name : metadata.names()) {
+ if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) {
+ continue;
+ }
+ writeKeyValueWrapper(name, metadata.get(name));
+ }
+ closeHeaders("headers", false, true);
+ writeKeyValue("body", getResponseContent());
+ closeObject("response");
+
+ // key
+ if (!this.keyPrefix.isEmpty()) {
+ this.keyPrefix += "-";
+ }
+ writeKeyValue("key", this.keyPrefix + getKey());
+
+ // imported
+ writeKeyValue("imported", getImported());
+
+ if (getInLinks() != null){
+ startArray("inlinks", false, true);
+ for (String link : getInLinks()) {
+ writeArrayValue(link);
+ }
+ closeArray("inlinks", false, true);
+ }
+ closeObject(null);
+
+ return generateJson();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON:" + ioe.getMessage());
+ }
+ }
+
+ // abstract methods
+
+ protected abstract void writeKeyValue(String key, String value) throws IOException;
+
+ protected abstract void writeKeyNull(String key) throws IOException;
+
+ protected abstract void startArray(String key, boolean nested, boolean newline) throws IOException;
+
+ protected abstract void closeArray(String key, boolean nested, boolean newline) throws IOException;
+
+ protected abstract void writeArrayValue(String value) throws IOException;
+
+ protected abstract void startObject(String key) throws IOException;
+
+ protected abstract void closeObject(String key) throws IOException;
+
+ protected abstract String generateJson() throws IOException;
+
+ // getters
+
+ protected String getUrl() {
+ try {
+ return URIUtil.encodePath(url);
+ } catch (URIException e) {
+ LOG.error("Can't encode URL " + url);
+ }
+
+ return url;
+ }
+
+ protected String getTimestamp() {
+ if (this.simpleDateFormat) {
+ String timestamp = null;
+ try {
+ long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get(Metadata.LAST_MODIFIED))).getTime();
+ timestamp = String.valueOf(epoch);
+ } catch (ParseException pe) {
+ LOG.warn(pe.getMessage());
+ }
+ return timestamp;
+ } else {
+ return ifNullString(metadata.get(Metadata.LAST_MODIFIED));
+ }
+ }
+
+ protected String getMethod() {
+ return new String("GET");
+ }
+
+ protected String getRequestHostName() {
+ String hostName = "";
+ try {
+ hostName = InetAddress.getLocalHost().getHostName();
+ } catch (UnknownHostException uhe) {
+
+ }
+ return hostName;
+ }
+
+ protected String getRequestHostAddress() {
+ String hostAddress = "";
+ try {
+ hostAddress = InetAddress.getLocalHost().getHostAddress();
+ } catch (UnknownHostException uhe) {
+
+ }
+ return hostAddress;
+ }
+
+ protected String getRequestSoftware() {
+ return conf.get("http.agent.version", "");
+ }
+
+ protected String getRequestRobots() {
+ return new String("CLASSIC");
+ }
+
+ protected String getRequestContactName() {
+ return conf.get("http.agent.name", "");
+ }
+
+ protected String getRequestContactEmail() {
+ return conf.get("http.agent.email", "");
+ }
+
+ protected String getRequestAccept() {
+ return conf.get("http.accept", "");
+ }
+
+ protected String getRequestAcceptEncoding() {
+ return new String(""); // TODO
+ }
+
+ protected String getRequestAcceptLanguage() {
+ return conf.get("http.accept.language", "");
+ }
+
+ protected String getRequestUserAgent() {
+ return conf.get("http.robots.agents", "");
+ }
+
+ protected String getResponseStatus() {
+ return ifNullString(metadata.get("status"));
+ }
+
+ protected String getResponseHostName() {
+ return URLUtil.getHost(url);
+ }
+
+ protected String getResponseAddress() {
+ return ifNullString(metadata.get("_ip_"));
+ }
+
+ protected String getResponseContentEncoding() {
+ return ifNullString(metadata.get("Content-Encoding"));
+ }
+
+ protected String getResponseContentType() {
+ return ifNullString(metadata.get("Content-Type"));
+ }
+
+ public List<String> getInLinks() {
+ return inLinks;
+ }
+
+ public void setInLinks(List<String> inLinks) {
+ this.inLinks = inLinks;
+ }
+
+ protected String getResponseDate() {
+ if (this.simpleDateFormat) {
+ String timestamp = null;
+ try {
+ long epoch = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+ timestamp = String.valueOf(epoch);
+ } catch (ParseException pe) {
+ LOG.warn(pe.getMessage());
+ }
+ return timestamp;
+ } else {
+ return ifNullString(metadata.get("Date"));
+ }
+ }
+
+ protected String getResponseServer() {
+ return ifNullString(metadata.get("Server"));
+ }
+
+ protected String getResponseContent() {
+ return new String(content.getContent());
+ }
+
+ protected String getKey() {
+ if (this.reverseKey) {
+ return this.reverseKeyValue;
+ }
+ else {
+ return url;
+ }
+ }
+
+ protected String getImported() {
+ if (this.simpleDateFormat) {
+ String timestamp = null;
+ try {
+ long epoch = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss z").parse(ifNullString(metadata.get("Date"))).getTime();
+ timestamp = String.valueOf(epoch);
+ } catch (ParseException pe) {
+ LOG.warn(pe.getMessage());
+ }
+ return timestamp;
+ } else {
+ return ifNullString(metadata.get("Date"));
+ }
+ }
+
+ private static String ifNullString(String value) {
+ return (value != null) ? value : "";
+ }
+
+ private void startHeaders(String key, boolean nested, boolean newline) throws IOException {
+ if (this.jsonArray) {
+ startArray(key, nested, newline);
+ }
+ else {
+ startObject(key);
+ }
+ }
+
+ private void closeHeaders(String key, boolean nested, boolean newline) throws IOException {
+ if (this.jsonArray) {
+ closeArray(key, nested, newline);
+ }
+ else {
+ closeObject(key);
+ }
+ }
+
+ private void writeKeyValueWrapper(String key, String value) throws IOException {
+ if (this.jsonArray) {
+ startArray(null, true, false);
+ writeArrayValue(key);
+ writeArrayValue(value);
+ closeArray(null, true, false);
+ }
+ else {
+ writeKeyValue(key, value);
+ }
+ }
+
+ @Override
+ public void close() {}
}