You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/03/26 03:56:20 UTC
svn commit: r1669248 - in /nutch/trunk: ./ src/java/org/apache/nutch/tools/
src/java/org/apache/nutch/util/
Author: mattmann
Date: Thu Mar 26 02:56:20 2015
New Revision: 1669248
URL: http://svn.apache.org/r1669248
Log:
fix for NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann).
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Mar 26 02:56:20 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann)
+
* NUTCH-1968 File Name too long issue of DumpFileUtil.java file (Xin Zhang, Renxia Wang via mattmann)
* NUTCH-1966 Configuration endpoint for 1x REST API (Sujen Shah via mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java Thu Mar 26 02:56:20 2015
@@ -23,12 +23,17 @@ import java.net.UnknownHostException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Abstract class that implements {@see CommonCrawlFormat} interface.
*
*/
public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+ private static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName());
+
protected String url;
protected byte[] content;
@@ -37,32 +42,117 @@ public abstract class AbstractCommonCraw
protected Configuration conf;
- public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf) {
+ protected String keyPrefix;
+
+ public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
this.url = url;
this.content = content;
this.metadata = metadata;
this.conf = conf;
+ this.keyPrefix = keyPrefix;
}
-
+
@Override
- public String getJsonData(boolean mapAll) throws IOException {
- if (mapAll) {
- return getJsonDataAll();
- }
- else {
- return getJsonDataSet();
+ public String getJsonData() throws IOException {
+ try {
+ startObject(null);
+
+ // url
+ writeKeyValue("url", getUrl());
+
+ // timestamp
+ writeKeyValue("timestamp", getTimestamp());
+
+ // request
+ startObject("request");
+ writeKeyValue("method", getMethod());
+ startObject("client");
+ writeKeyValue("hostname", getRequestHostName());
+ writeKeyValue("address", getRequestHostAddress());
+ writeKeyValue("software", getRequestSoftware());
+ writeKeyValue("robots", getRequestRobots());
+ startObject("contact");
+ writeKeyValue("name", getRequestContactName());
+ writeKeyValue("email", getRequestContactEmail());
+ closeObject("contact");
+ closeObject("client");
+ startObject("headers");
+ writeKeyValue("Accept", getRequestAccept());
+ writeKeyValue("Accept-Encoding", getRequestAcceptEncoding());
+ writeKeyValue("Accept-Language", getRequestAcceptLanguage());
+ writeKeyValue("User-Agent", getRequestUserAgent());
+ closeObject("headers");
+ writeKeyNull("body");
+ closeObject("request");
+
+ // response
+ startObject("response");
+ writeKeyValue("status", getResponseStatus());
+ startObject("server");
+ writeKeyValue("hostname", getResponseHostName());
+ writeKeyValue("address", getResponseAddress());
+ closeObject("server");
+ startObject("headers");
+ writeKeyValue("Content-Encoding", getResponseContentEncoding());
+ writeKeyValue("Content-Type", getResponseContentType());
+ writeKeyValue("Date", getResponseDate());
+ writeKeyValue("Server", getResponseServer());
+ for (String name : metadata.names()) {
+ if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) {
+ continue;
+ }
+ writeKeyValue(name, metadata.get(name));
+ }
+ closeObject("headers");
+ writeKeyValue("body", getResponseContent());
+ closeObject("response");
+
+ // key
+ if (!this.keyPrefix.isEmpty()) {
+ this.keyPrefix += "-";
+ }
+ writeKeyValue("key", this.keyPrefix + getKey());
+
+ // imported
+ writeKeyValue("imported", getImported());
+
+ closeObject(null);
+
+ return generateJson();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON:" + ioe.getMessage());
}
}
- protected abstract String getJsonDataSet() throws IOException;
+ // abstract methods
- protected abstract String getJsonDataAll() throws IOException;
+ protected abstract void writeKeyValue(String key, String value) throws IOException;
- protected String ifNullString(String value) {
- return (value != null) ? value : "";
+ protected abstract void writeKeyNull(String key) throws IOException;
+
+ protected abstract void startObject(String key) throws IOException;
+
+ protected abstract void closeObject(String key) throws IOException;
+
+ protected abstract String generateJson() throws IOException;
+
+ // getters
+
+ protected String getUrl() {
+ return url;
+ }
+
+ protected String getTimestamp() {
+ return metadata.get(ifNullString(Metadata.LAST_MODIFIED));
+ }
+
+ protected String getMethod() {
+ return new String("GET");
}
- protected static String getHostName() {
+ protected String getRequestHostName() {
String hostName = "";
try {
hostName = InetAddress.getLocalHost().getHostName();
@@ -72,7 +162,7 @@ public abstract class AbstractCommonCraw
return hostName;
}
- protected static String getHostAddress() {
+ protected String getRequestHostAddress() {
String hostAddress = "";
try {
hostAddress = InetAddress.getLocalHost().getHostAddress();
@@ -81,4 +171,80 @@ public abstract class AbstractCommonCraw
}
return hostAddress;
}
+
+ protected String getRequestSoftware() {
+ return conf.get("http.agent.version", "");
+ }
+
+ protected String getRequestRobots() {
+ return new String("CLASSIC");
+ }
+
+ protected String getRequestContactName() {
+ return conf.get("http.agent.name", "");
+ }
+
+ protected String getRequestContactEmail() {
+ return conf.get("http.agent.email", "");
+ }
+
+ protected String getRequestAccept() {
+ return conf.get("http.accept", "");
+ }
+
+ protected String getRequestAcceptEncoding() {
+ return new String(""); // TODO
+ }
+
+ protected String getRequestAcceptLanguage() {
+ return conf.get("http.accept.language", "");
+ }
+
+ protected String getRequestUserAgent() {
+ return conf.get("http.robots.agents", "");
+ }
+
+ protected String getResponseStatus() {
+ return ifNullString(metadata.get("status"));
+ }
+
+ protected String getResponseHostName() {
+ return URLUtil.getHost(url);
+ }
+
+ protected String getResponseAddress() {
+ return ifNullString(metadata.get("_ip_"));
+ }
+
+ protected String getResponseContentEncoding() {
+ return ifNullString(metadata.get("Content-Encoding"));
+ }
+
+ protected String getResponseContentType() {
+ return ifNullString(metadata.get("Content-Type"));
+ }
+
+ protected String getResponseDate() {
+ return ifNullString(metadata.get("Date"));
+ }
+
+ protected String getResponseServer() {
+ return ifNullString(metadata.get("Server"));
+ }
+
+ protected String getResponseContent() {
+ return new String(content);
+ }
+
+ protected String getKey() {
+ return url;
+ }
+
+ protected String getImported() {
+ return new String(""); // TODO
+ }
+
+ private static String ifNullString(String value) {
+ return (value != null) ? value : "";
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Thu Mar 26 02:56:20 2015
@@ -30,7 +30,6 @@ import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
-import java.security.MessageDigest;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -54,12 +53,15 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
import org.apache.nutch.util.NutchConfiguration;
//Tika imports
import org.apache.tika.Tika;
+
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -162,6 +164,13 @@ import com.ibm.icu.text.SimpleDateFormat
public class CommonCrawlDataDumper {
private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+
+ // Gzip initialization
+ private FileOutputStream fileOutput = null;
+ private BufferedOutputStream bufOutput = null;
+ private GzipCompressorOutputStream gzipOutput = null;
+ private TarArchiveOutputStream tarOutput = null;
+ private ArrayList<String> fileList = null;
/**
* Main method for invoking this tool
@@ -177,17 +186,20 @@ public class CommonCrawlDataDumper {
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
Option helpOpt = new Option("h", "help", false,
- "show this help message");
+ "show this help message.");
// argument options
Option outputOpt = OptionBuilder
.withArgName("outputDir")
.hasArg()
.withDescription(
- "output directory (which will be created) to host the CBOR data")
+ "output directory (which will be created) to host the CBOR data.")
.create("outputDir");
- Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
- .withDescription("the segment(s) to use").create("segment");
- // GIUSEPPE: create mimetype and gzip options
+ Option segOpt = OptionBuilder
+ .withArgName("segment")
+ .hasArgs()
+ .withDescription("the segment(s) to use")
+ .create("segment");
+ // create mimetype and gzip options
Option mimeOpt = OptionBuilder
.isRequired(false)
.withArgName("mimetype")
@@ -196,11 +208,16 @@ public class CommonCrawlDataDumper {
"an optional list of mimetypes to dump, excluding all others. Defaults to all.")
.create("mimetype");
Option gzipOpt = OptionBuilder
- .isRequired(false)
+ .withArgName("gzip")
.hasArg(false)
.withDescription(
- "an optional flag indicating whether to additionally gzip the data")
+ "an optional flag indicating whether to additionally gzip the data.")
.create("gzip");
+ Option keyPrefixOpt = OptionBuilder
+ .withArgName("keyPrefix")
+ .hasArg(true)
+ .withDescription("an optional prefix for key in the output format.")
+ .create("keyPrefix");
// create the options
Options options = new Options();
@@ -210,6 +227,8 @@ public class CommonCrawlDataDumper {
// create mimetypes and gzip options
options.addOption(mimeOpt);
options.addOption(gzipOpt);
+ // create keyPrefix option
+ options.addOption(keyPrefixOpt);
CommandLineParser parser = new GnuParser();
try {
@@ -224,6 +243,7 @@ public class CommonCrawlDataDumper {
File segmentRootDir = new File(line.getOptionValue("segment"));
String[] mimeTypes = line.getOptionValues("mimetype");
boolean gzip = line.hasOption("gzip");
+ String keyPrefix = line.getOptionValue("keyPrefix", "");
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
@@ -233,7 +253,7 @@ public class CommonCrawlDataDumper {
CommonCrawlDataDumper dumper = new CommonCrawlDataDumper();
- dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes);
+ dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, keyPrefix);
} catch (Exception e) {
LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils.stringifyException(e));
@@ -261,7 +281,7 @@ public class CommonCrawlDataDumper {
* filtered out.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, boolean gzip, String[] mimeTypes) throws Exception {
+ public void dump(File outputDir, File segmentRootDir, boolean gzip, String[] mimeTypes, String keyPrefix) throws Exception {
if (!gzip) {
LOG.info("Gzipping CBOR data has been skipped");
}
@@ -284,22 +304,9 @@ public class CommonCrawlDataDumper {
System.exit(1);
}
- // Gzip initialization
- FileOutputStream fileOutput = null;
- BufferedOutputStream bufOutput = null;
- GzipCompressorOutputStream gzipOutput = null;
- TarArchiveOutputStream tarOutput = null;
-
- ArrayList<String> fileList = null;
-
if (gzip) {
- String archiveName = new SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
- fileOutput = new FileOutputStream(new File(outputDir + File.separator + archiveName));
- bufOutput = new BufferedOutputStream(fileOutput);
- gzipOutput = new GzipCompressorOutputStream(bufOutput);
- tarOutput = new TarArchiveOutputStream(gzipOutput);
-
- fileList = new ArrayList<String>();
+ fileList = new ArrayList<String>();
+ constructNewStream(outputDir);
}
for (File segment : segmentDirs) {
@@ -334,7 +341,14 @@ public class CommonCrawlDataDumper {
extension = "html";
}
- String filename = baseName + "." + extension;
+ String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+ String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
+ String filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
+ String outputFullPath = String.format("%s/%s", fullDir, filename);
+
+ String [] fullPathLevels = fullDir.split(File.separator);
+ String firstLevelDirName = fullPathLevels[fullPathLevels.length-2];
+ String secondLevelDirName = fullPathLevels[fullPathLevels.length-1];
// Encode all filetypes if no mimetypes have been given
Boolean filter = (mimeTypes == null);
@@ -343,8 +357,8 @@ public class CommonCrawlDataDumper {
try {
String mimeType = new Tika().detect(content.getContent());
// Maps file to JSON-based structure
- CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, content.getContent(), content.getMetadata(), conf);
- jsonData = format.getJsonData(false);
+ CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, content.getContent(), content.getMetadata(), conf, keyPrefix);
+ jsonData = format.getJsonData();
collectStats(typeCounts, mimeType);
// collects statistics for the given mimetypes
@@ -352,53 +366,36 @@ public class CommonCrawlDataDumper {
collectStats(filteredCounts, mimeType);
filter = true;
}
- } catch (Exception e) {
- e.printStackTrace();
- LOG.warn("Tika is unable to detect type for: [" + url
- + "]");
+ } catch (IOException ioe) {
+ LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
+ return;
}
if (filter) {
-
byte[] byteData = serializeCBORData(jsonData);
if (!gzip) {
- String outputFullPath = outputDir + File.separator + filename;
+ //String outputFullPath = outputDir + File.separator + filename;
File outputFile = new File(outputFullPath);
if (outputFile.exists()) {
LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
}
else {
LOG.info("Writing: [" + outputFullPath + "]");
- try{
- IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
- }
- catch (Exception e){
- MessageDigest md = MessageDigest.getInstance("MD5");
- md.update(outputFullPath.getBytes());
- byte[] digest = md.digest();
- StringBuffer sb = new StringBuffer();
- for (byte b : digest) {
- sb.append(String.format("%02x", b & 0xff));
- }
- outputFullPath = outputFullPath.substring(0, 32) + "_" + sb.toString();
- File newOutPutFile = new File(outputFullPath);
- IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(newOutPutFile));
- LOG.info("File name is too long. Truncated and MD5 appended.");
- }
+ IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
}
}
else {
- if (fileList.contains(filename)) {
- LOG.info("Skipping compressing: [" + filename + "]: file already exists");
+ if (fileList.contains(outputFullPath)) {
+ LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
}
else {
- fileList.add(filename);
- LOG.info("Compressing: [" + filename + "]");
- TarArchiveEntry tarEntry = new TarArchiveEntry(filename);
+ fileList.add(outputFullPath);
+ LOG.info("Compressing: [" + outputFullPath + "]");
+ TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
tarEntry.setSize(byteData.length);
tarOutput.putArchiveEntry(tarEntry);
- IOUtils.copy(new ByteArrayInputStream(byteData), tarOutput);
+ tarOutput.write(byteData);
tarOutput.closeArchiveEntry();
}
}
@@ -411,15 +408,35 @@ public class CommonCrawlDataDumper {
}
if (gzip) {
+ closeStream();
+ }
+
+ if (!typeCounts.isEmpty()) {
+ LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+ }
+ }
+
+ private void closeStream() {
+ try {
tarOutput.finish();
-
+
tarOutput.close();
gzipOutput.close();
bufOutput.close();
fileOutput.close();
+ } catch (IOException ioe) {
+ LOG.warn("Error in closing stream: " + ioe.getMessage());
}
-
- LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+ }
+
+ private void constructNewStream(File outputDir) throws IOException {
+ String archiveName = new SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
+ LOG.info("Creating a new gzip archive: " + archiveName);
+ fileOutput = new FileOutputStream(new File(outputDir + File.separator + archiveName));
+ bufOutput = new BufferedOutputStream(fileOutput);
+ gzipOutput = new GzipCompressorOutputStream(bufOutput);
+ tarOutput = new TarArchiveOutputStream(gzipOutput);
+ tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
}
private byte[] serializeCBORData(String jsonData) {
@@ -458,8 +475,8 @@ public class CommonCrawlDataDumper {
private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
StringBuilder builder = new StringBuilder();
// print total stats
- builder.append("\n TOTAL Stats:\n");
- builder.append(" {\n");
+ builder.append("\nTOTAL Stats:\n");
+ builder.append("{\n");
for (String mimeType : typeCounts.keySet()) {
builder.append(" {\"mimeType\":\"");
builder.append(mimeType);
@@ -470,8 +487,8 @@ public class CommonCrawlDataDumper {
builder.append("}\n");
// filtered types stats
if (!filteredCounts.isEmpty()) {
- builder.append("\n FILTERED Stats:\n");
- builder.append(" {\n");
+ builder.append("\nFILTERED Stats:\n");
+ builder.append("{\n");
for (String mimeType : filteredCounts.keySet()) {
builder.append(" {\"mimeType\":\"");
builder.append(mimeType);
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Thu Mar 26 02:56:20 2015
@@ -33,5 +33,6 @@ public interface CommonCrawlFormat {
* @param mapAll If {@code true} maps all metdata on the JSON structure.
* @return the JSON data
*/
- public String getJsonData(boolean mapAll) throws IOException;
+ //public String getJsonData(boolean mapAll) throws IOException;
+ public String getJsonData() throws IOException;
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java Thu Mar 26 02:56:20 2015
@@ -17,6 +17,8 @@
package org.apache.nutch.tools;
+import java.io.IOException;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
@@ -34,21 +36,22 @@ public class CommonCrawlFormatFactory {
* @param metadata the metadata.
* @param conf the configuration.
* @return the new {@see CommonCrawlFormat} object.
+ * @throws IOException If any I/O error occurs.
*/
public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, byte[] content,
- Metadata metadata, Configuration conf) {
+ Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
if (formatType == null) {
return null;
}
if (formatType.equalsIgnoreCase("jackson")) {
- return new CommonCrawlFormatJackson(url, content, metadata, conf);
+ return new CommonCrawlFormatJackson(url, content, metadata, conf, keyPrefix);
}
else if (formatType.equalsIgnoreCase("jettinson")) {
- return new CommonCrawlFormatJettinson(url, content, metadata, conf);
+ return new CommonCrawlFormatJettinson(url, content, metadata, conf, keyPrefix);
}
else if (formatType.equalsIgnoreCase("simple")) {
- return new CommonCrawlFormatSimple(url, content, metadata, conf);
+ return new CommonCrawlFormatSimple(url, content, metadata, conf, keyPrefix);
}
return null;
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java Thu Mar 26 02:56:20 2015
@@ -22,9 +22,6 @@ import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
@@ -34,220 +31,52 @@ import com.fasterxml.jackson.core.JsonGe
*
*/
public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
-
- private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
-
- public CommonCrawlFormatJackson(String url, byte[] content,
- Metadata metadata, Configuration conf) {
- super(url, content, metadata, conf);
- }
-
- @Override
- protected String getJsonDataAll() throws IOException {
- JsonFactory factory = new JsonFactory();
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- JsonGenerator generator = null;
-
- try {
- generator = factory.createGenerator(out);
- generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
-
- generator.writeStartObject();
-
- // url
- generator.writeFieldName("url");
- generator.writeString(url);
-
- // timestamp
- generator.writeFieldName("timestamp");
- generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
-
-
- //request
- generator.writeFieldName("request");
- generator.writeStartObject();
- generator.writeFieldName("method");
- generator.writeString("GET");
- generator.writeFieldName("client");
- generator.writeStartObject();
- generator.writeFieldName("hostname");
- generator.writeString(getHostName());
- generator.writeFieldName("address");
- generator.writeString(getHostAddress());
- generator.writeFieldName("software");
- generator.writeString(conf.get("http.agent.version", ""));
- generator.writeFieldName("robots");
- generator.writeString("classic");
- generator.writeFieldName("contact");
- generator.writeStartObject();
- generator.writeFieldName("name");
- generator.writeString(conf.get("http.agent.name", ""));
- generator.writeFieldName("email");
- generator.writeString(conf.get("http.agent.email", ""));
- generator.writeEndObject();
- generator.writeFieldName("headers");
- generator.writeStartObject();
- generator.writeFieldName("Accept");
- generator.writeString(conf.get("accept", ""));
- generator.writeFieldName("Accept-Encoding");
- generator.writeString(""); // TODO
- generator.writeFieldName("Accept-Language");
- generator.writeString(conf.get("http.accept.language", ""));
- generator.writeFieldName("User-Agent");
- generator.writeString(conf.get("http.robots.agents", ""));
- generator.writeEndObject();
- generator.writeFieldName("body");
- generator.writeNull();
- generator.writeEndObject();
-
- //response
- generator.writeFieldName("response");
- generator.writeStartObject();
- generator.writeFieldName("status");
- generator.writeString(ifNullString(metadata.get("status")));
- generator.writeFieldName("server");
-
- generator.writeStartObject();
- generator.writeFieldName("hostname");
- generator.writeString(URLUtil.getHost(url));
- generator.writeFieldName("address");
- generator.writeString(ifNullString(metadata.get("_ip_")));
- generator.writeEndObject();
-
- generator.writeFieldName("headers");
- generator.writeStartObject();
- for (String name : metadata.names()) {
- generator.writeFieldName(name);
- generator.writeString(ifNullString(metadata.get(name)));
- }
- generator.writeEndObject();
-
- generator.writeFieldName("body");
- generator.writeString(new String(content));
- generator.writeEndObject();
-
- generator.writeFieldName("key");
- generator.writeString(url);
-
- generator.writeFieldName("imported"); // TODO
- generator.writeString("");
-
- generator.writeEndObject();
-
- generator.flush();
-
- return out.toString();
-
- } catch (IOException ioe) {
- LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
- throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage());
- }
- }
-
- @Override
- protected String getJsonDataSet() throws IOException {
- JsonFactory factory = new JsonFactory();
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- JsonGenerator generator = null;
-
- try {
- generator = factory.createGenerator(out);
- generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
-
- generator.writeStartObject();
-
- // url
- generator.writeFieldName("url");
- generator.writeString(url);
-
- // timestamp
- generator.writeFieldName("timestamp");
- generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
-
- //request
- generator.writeFieldName("request");
- generator.writeStartObject();
- generator.writeFieldName("method");
- generator.writeString("GET");
- generator.writeFieldName("client");
- generator.writeStartObject();
- generator.writeFieldName("hostname");
- generator.writeString(getHostName());
- generator.writeFieldName("address");
- generator.writeString(getHostAddress());
- generator.writeFieldName("software");
- generator.writeString(conf.get("http.agent.version", ""));
- generator.writeFieldName("robots");
- generator.writeString("CLASSIC");
- generator.writeFieldName("contact");
- generator.writeStartObject();
- generator.writeFieldName("name");
- generator.writeString(conf.get("http.agent.name", ""));
- generator.writeFieldName("email");
- generator.writeString(conf.get("http.agent.email", ""));
- generator.writeEndObject();
- generator.writeFieldName("headers");
- generator.writeStartObject();
- generator.writeFieldName("Accept");
- generator.writeString(conf.get("accept", ""));
- generator.writeFieldName("Accept-Encoding");
- generator.writeString(""); // TODO
- generator.writeFieldName("Accept-Language");
- generator.writeString(conf.get("http.accept.language", ""));
- generator.writeFieldName("User-Agent");
- generator.writeString(conf.get("http.robots.agents", ""));
- generator.writeEndObject();
- generator.writeFieldName("body");
- generator.writeNull();
- generator.writeEndObject();
-
- //response
- generator.writeFieldName("response");
- generator.writeStartObject();
- generator.writeFieldName("status");
- generator.writeString(ifNullString(metadata.get("status")));
- generator.writeFieldName("server");
-
- generator.writeStartObject();
- generator.writeFieldName("hostname");
- generator.writeString(URLUtil.getHost(url));
- generator.writeFieldName("address");
- generator.writeString(ifNullString(metadata.get("_ip_")));
- generator.writeEndObject();
-
- generator.writeFieldName("headers");
- generator.writeStartObject();
- generator.writeFieldName("Content-Encoding");
- generator.writeString(ifNullString(metadata.get("Content-Encoding")));
- generator.writeFieldName("Content-Type");
- generator.writeString(ifNullString(metadata.get("Content-Type")));
- generator.writeFieldName("Date");
- generator.writeString(ifNullString(metadata.get("Date")));
- generator.writeFieldName("Server");
- generator.writeString(ifNullString(metadata.get("Server")));
- generator.writeEndObject();
-
- generator.writeFieldName("body");
- generator.writeString(new String(content));
- generator.writeEndObject();
-
- generator.writeFieldName("key");
- generator.writeString(url);
-
- generator.writeFieldName("imported"); // TODO
- generator.writeString("");
-
- generator.writeEndObject();
-
- generator.flush();
-
- return out.toString();
-
- } catch (IOException ioe) {
- LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
- throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage());
- }
- }
+
+ //private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+
+ private ByteArrayOutputStream out;
+
+ private JsonGenerator generator;
+
+ public CommonCrawlFormatJackson(String url, byte[] content,
+ Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
+ super(url, content, metadata, conf, keyPrefix);
+
+ JsonFactory factory = new JsonFactory();
+ this.out = new ByteArrayOutputStream();
+ this.generator = factory.createGenerator(out);
+
+ this.generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+ }
+
+ @Override
+ protected void writeKeyValue(String key, String value) throws IOException {
+ generator.writeFieldName(key);
+ generator.writeString(value);
+ }
+
+ @Override
+ protected void writeKeyNull(String key) throws IOException {
+ generator.writeFieldName(key);
+ generator.writeNull();;
+ }
+
+ @Override
+ protected void startObject(String key) throws IOException {
+ if (key != null) {
+ generator.writeFieldName(key);
+ }
+ generator.writeStartObject();
+ }
+
+ @Override
+ protected void closeObject(String key) throws IOException {
+ generator.writeEndObject();
+ }
+
+ @Override
+ protected String generateJson() throws IOException {
+ this.generator.flush();
+ return this.out.toString();
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java Thu Mar 26 02:56:20 2015
@@ -18,15 +18,13 @@
package org.apache.nutch.tools;
import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;
-import org.mortbay.log.Log;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* This class provides methods to map crawled data on JSON using Jettinson APIs.
@@ -34,135 +32,57 @@ import org.slf4j.LoggerFactory;
*/
public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
- private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+ private Deque<JSONObject> stack;
public CommonCrawlFormatJettinson(String url, byte[] content,
- Metadata metadata, Configuration conf) {
- super(url, content, metadata, conf);
+ Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
+ super(url, content, metadata, conf, keyPrefix);
+
+ stack = new ArrayDeque<JSONObject>();
}
@Override
- protected String getJsonDataAll() throws IOException {
+ protected void writeKeyValue(String key, String value) throws IOException {
+ try {
+ stack.getFirst().put(key, value);
+ } catch (JSONException jsone) {
+ throw new IOException(jsone.getMessage());
+ }
+ }
+
+ @Override
+ protected void writeKeyNull(String key) throws IOException {
+ try {
+ stack.getFirst().put(key, JSONObject.NULL);
+ } catch (JSONException jsone) {
+ throw new IOException(jsone.getMessage());
+ }
+ }
+
+ @Override
+ protected void startObject(String key) throws IOException {
JSONObject object = new JSONObject();
-
+ stack.push(object);
+ }
+
+ @Override
+ protected void closeObject(String key) throws IOException {
try {
- // url
- object.put("url", url);
-
- // timestamp
- object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
-
- // request
- JSONObject requestObject = new JSONObject();
- requestObject.put("method", "GET");
- JSONObject clientObject = new JSONObject();
- clientObject.put("hostname", getHostName());
- clientObject.put("address", getHostAddress());
- clientObject.put("software", conf.get("http.agent.version", ""));
- clientObject.put("robots", "CLASSIC");
- JSONObject contactObject = new JSONObject();
- contactObject.put("name", conf.get("http.agent.name", ""));
- contactObject.put("email", conf.get("http.agent.email", ""));
- clientObject.put("contact", contactObject);
- requestObject.put("client", clientObject);
- JSONObject reqHeadersObject = new JSONObject();
- reqHeadersObject.put("Accept", conf.get("http.accept", ""));
- reqHeadersObject.put("Accept-Encoding", ""); // TODO
- reqHeadersObject.put("Accept-Language", conf.get("http.accept.language", ""));
- reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
- requestObject.put("headers", reqHeadersObject);
- requestObject.put("body", JSONObject.NULL);
- object.put("request", requestObject);
-
- // response
- JSONObject responseObject = new JSONObject();
- responseObject.put("status", ifNullString(metadata.get("status")));
- JSONObject serverObject = new JSONObject();
- serverObject.put("hostname", URLUtil.getHost(url));
- serverObject.put("address", ifNullString(metadata.get("_ip_")));
- responseObject.put("client", serverObject);
- JSONObject respHeadersObject = new JSONObject();
- for (String name : metadata.names()) {
- respHeadersObject.put(name, ifNullString(metadata.get(name)));
+ if (stack.size() > 1) {
+ JSONObject object = stack.pop();
+ stack.getFirst().put(key, object);
}
- responseObject.put("headers", respHeadersObject);
- responseObject.put("body", new String(content));
- object.put("response", responseObject);
-
- // key
- object.put("key", url);
-
- // imported
- object.put("imported", ""); // TODO
-
- return object.toString(2); // INDENTED OUTPUT
-
} catch (JSONException jsone) {
- LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
- throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage());
+ throw new IOException(jsone.getMessage());
}
}
-
+
@Override
- protected String getJsonDataSet() throws IOException {
- JSONObject object = new JSONObject();
-
+ protected String generateJson() throws IOException {
try {
- // url
- object.put("url", url);
-
- // timestamp
- object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
-
- // request
- JSONObject requestObject = new JSONObject();
- requestObject.put("method", "GET");
- JSONObject clientObject = new JSONObject();
- clientObject.put("hostname", getHostName());
- clientObject.put("address", getHostAddress());
- clientObject.put("software", conf.get("http.agent.version", ""));
- clientObject.put("robots", "CLASSIC");
- JSONObject contactObject = new JSONObject();
- contactObject.put("name", conf.get("http.agent.name", ""));
- contactObject.put("email", conf.get("http.agent.email", ""));
- clientObject.put("contact", contactObject);
- requestObject.put("client", clientObject);
- JSONObject reqHeadersObject = new JSONObject();
- reqHeadersObject.put("Accept", conf.get("http.accept", ""));
- reqHeadersObject.put("Accept-Encoding", ""); // TODO
- reqHeadersObject.put("Accept-Language", conf.get("http.accept.language", ""));
- reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
- requestObject.put("headers", reqHeadersObject);
- requestObject.put("body", JSONObject.NULL);
- object.put("request", requestObject);
-
- // response
- JSONObject responseObject = new JSONObject();
- responseObject.put("status", ifNullString(metadata.get("status")));
- JSONObject serverObject = new JSONObject();
- serverObject.put("hostname", URLUtil.getHost(url));
- serverObject.put("address", ifNullString(metadata.get("_ip_")));
- responseObject.put("client", serverObject);
- JSONObject respHeadersObject = new JSONObject();
- respHeadersObject.put("Content-Encoding", ifNullString(metadata.get("Content-Encoding")));
- respHeadersObject.put("Content-Type", ifNullString(metadata.get("Content-Type")));
- respHeadersObject.put("Date", ifNullString(metadata.get("Date")));
- respHeadersObject.put("Server", ifNullString(metadata.get("Server")));
- responseObject.put("headers", respHeadersObject);
- responseObject.put("body", new String(content));
- object.put("response", responseObject);
-
- // key
- object.put("key", url);
-
- // imported
- object.put("imported", ""); // TODO
-
- return object.toString(2); // INDENTED OUTPUT
-
+ return stack.getFirst().toString(2);
} catch (JSONException jsone) {
- LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
- throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage());
+ throw new IOException(jsone.getMessage());
}
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java Thu Mar 26 02:56:20 2015
@@ -17,9 +17,10 @@
package org.apache.nutch.tools;
+import java.io.IOException;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
/**
* This class provides methods to map crawled data on JSON using a {@see StringBuilder} object.
@@ -27,126 +28,113 @@ import org.apache.nutch.util.URLUtil;
*/
public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
+ private StringBuilder sb;
+
+ private int tabCount;
+
public CommonCrawlFormatSimple(String url, byte[] content, Metadata metadata,
- Configuration conf) {
- super(url, content, metadata, conf);
+ Configuration conf, String keyPrefix) throws IOException {
+ super(url, content, metadata, conf, keyPrefix);
+
+ this.sb = new StringBuilder();
+ this.tabCount = 0;
}
- @Override
- protected String getJsonDataAll() {
- // TODO character escaping
- StringBuilder sb = new StringBuilder();
- sb.append("{\n");
-
- // url
- sb.append("\t\"url\": \"" + url + "\",\n");
-
- // timstamp
- sb.append("\t\"timstamp\": \"" + metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
-
- // request
- sb.append("\t\"request\": {\n");
- sb.append("\t\t\"method\": \"GET\",\n");
- sb.append("\t\t\"client\": {\n");
- sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
- sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
- sb.append("\t\t\t\"software\": \"" + conf.get("http.agent.version", "") + "\",\n");
- sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
- sb.append("\t\t\t\"contact\": {\n");
- sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name", "") + "\",\n");
- sb.append("\t\t\t\t\"email\": \"" + conf.get("http.agent.email", "") + "\",\n");
- sb.append("\t\t\t}\n");
- sb.append("\t\t},\n");
- sb.append("\t\t\"headers\": {\n");
- sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "") + "\",\n");
- sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); //TODO
- sb.append("\t\t\t\"Accept-Language\": \"" + conf.get("http.accept.language", "") + "\",\n");
- sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "") + "\",\n");
- sb.append("\t},\n");
-
- // response
- sb.append("\t\"response\": {\n");
- sb.append("\t\t\"status\": \"" + ifNullString(metadata.get("status")) + "\",\n");
- sb.append("\t\t\"server\": {\n");
- sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n");
- sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") + "\"\n");
- sb.append("\t\t},\n");
- sb.append("\t\t\"headers\": {\n");
- for (String name : metadata.names()) {
- sb.append("\t\t\t\"" + name + "\": \"" + metadata.get(name) + "\"\n");
+ protected void writeKeyValue(String key, String value) throws IOException {
+ sb.append(printTabs() + "\"" + key + "\": " + quote(value) + ",\n");
+ }
+
+ protected void writeKeyNull(String key) throws IOException {
+ sb.append(printTabs() + "\"" + key + "\": null,\n");
+ }
+
+ protected void startObject(String key) throws IOException {
+ String name = "";
+ if (key != null) {
+ name = "\"" + key + "\": ";
}
- sb.append("\t\t},\n");
- sb.append("\t\t\"body\": " + new String(content) + "\",\n");
- sb.append("\t},\n");
-
- // key
- sb.append("\t\"key\": \"" + url + "\",\n");
-
- // imported
- sb.append("\t\"imported\": \"\"\n"); //TODO
-
- sb.append("}");
-
+ sb.append(printTabs() + name + "{\n");
+ this.tabCount++;
+ }
+
+ protected void closeObject(String key) throws IOException {
+ sb.deleteCharAt(sb.length()-2); // delete comma
+ this.tabCount--;
+ sb.append(printTabs() + "},\n");
+ }
+
+ protected String generateJson() throws IOException {
+ sb.deleteCharAt(sb.length()-1); // delete new line
+ sb.deleteCharAt(sb.length()-1); // delete comma
return sb.toString();
}
- @Override
- protected String getJsonDataSet() {
- // TODO character escaping
+ private String printTabs() {
StringBuilder sb = new StringBuilder();
- sb.append("{\n");
-
- // url
- sb.append("\t\"url\": \"" + url + "\",\n");
-
- // timstamp
- sb.append("\t\"timestamp\": \"" + metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
-
- // request
- sb.append("\t\"request\": {\n");
- sb.append("\t\t\"method\": \"GET\",\n");
- sb.append("\t\t\"client\": {\n");
- sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
- sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
- sb.append("\t\t\t\"software\": \"" + conf.get("http.agent.version", "") + "\",\n");
- sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
- sb.append("\t\t\t\"contact\": {\n");
- sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name", "") + "\",\n");
- sb.append("\t\t\t\t\"email\": \"" + conf.get("http.agent.email", "") + "\",\n");
- sb.append("\t\t\t}\n");
- sb.append("\t\t},\n");
- sb.append("\t\t\"headers\": {\n");
- sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "") + "\",\n");
- sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); // TODO
- sb.append("\t\t\t\"Accept-Language\": \"" + conf.get("http.accept.language", "") + "\",\n");
- sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "") + "\",\n");
- sb.append("\t},\n");
-
- // response
- sb.append("\t\"response\": {\n");
- sb.append("\t\t\"status\": \"" + ifNullString(metadata.get("status")) + "\",\n");
- sb.append("\t\t\"server\": {\n");
- sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n");
- sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") + "\"\n");
- sb.append("\t\t},\n");
- sb.append("\t\t\"headers\": {\n");
- sb.append("\t\t\t\"Content-Encoding\": " + ifNullString(metadata.get("Content-Encoding")));
- sb.append("\t\t\t\"Content-Type\": " + ifNullString(metadata.get("Content-Type")));
- sb.append("\t\t\t\"Date\": " + ifNullString(metadata.get("Date")));
- sb.append("\t\t\t\"Server\": " + ifNullString(metadata.get("Server")));
- sb.append("\t\t},\n");
- sb.append("\t\t\"body\": " + new String(content) + "\",\n");
- sb.append("\t},\n");
-
- // key
- sb.append("\t\"key\": \"" + url + "\",\n");
-
- // imported
- sb.append("\t\"imported\": \"\"\n"); // TODO
-
- sb.append("}");
-
+ for (int i=0; i < this.tabCount ;i++) {
+ sb.append("\t");
+ }
return sb.toString();
}
-
+
+ private static String quote(String string) throws IOException {
+ StringBuilder sb = new StringBuilder();
+
+ if (string == null || string.length() == 0) {
+ sb.append("\"\"");
+ return sb.toString();
+ }
+
+ char b;
+ char c = 0;
+ String hhhh;
+ int i;
+ int len = string.length();
+
+ sb.append('"');
+ for (i = 0; i < len; i += 1) {
+ b = c;
+ c = string.charAt(i);
+ switch (c) {
+ case '\\':
+ case '"':
+ sb.append('\\');
+ sb.append(c);
+ break;
+ case '/':
+ if (b == '<') {
+ sb.append('\\');
+ }
+ sb.append(c);
+ break;
+ case '\b':
+ sb.append("\\b");
+ break;
+ case '\t':
+ sb.append("\\t");
+ break;
+ case '\n':
+ sb.append("\\n");
+ break;
+ case '\f':
+ sb.append("\\f");
+ break;
+ case '\r':
+ sb.append("\\r");
+ break;
+ default:
+ if (c < ' ' || (c >= '\u0080' && c < '\u00a0')
+ || (c >= '\u2000' && c < '\u2100')) {
+ sb.append("\\u");
+ hhhh = Integer.toHexString(c);
+ sb.append("0000", 0, 4 - hhhh.length());
+ sb.append(hhhh);
+ } else {
+ sb.append(c);
+ }
+ }
+ }
+ sb.append('"');
+ return sb.toString();
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Thu Mar 26 02:56:20 2015
@@ -46,21 +46,27 @@ public class DumpFileUtil {
return sb.toString();
}
- public static String createTwoLevelsDirectory(String basePath, String md5) {
+ public static String createTwoLevelsDirectory(String basePath, String md5, boolean makeDir) {
String firstLevelDirName = new StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString();
String secondLevelDirName = new StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString();
String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);
- try {
- FileUtils.forceMkdir(new File(fullDirPath));
- } catch (IOException e) {
- LOG.error("Failed to create dir: {}", fullDirPath);
- fullDirPath = null;
+ if (makeDir) {
+ try {
+ FileUtils.forceMkdir(new File(fullDirPath));
+ } catch (IOException e) {
+ LOG.error("Failed to create dir: {}", fullDirPath);
+ fullDirPath = null;
+ }
}
return fullDirPath;
}
+
+ public static String createTwoLevelsDirectory(String basePath, String md5) {
+ return createTwoLevelsDirectory(basePath, md5, true);
+ }
public static String createFileName(String md5, String fileBaseName, String fileExtension) {
if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {