You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/03/26 03:56:20 UTC

svn commit: r1669248 - in /nutch/trunk: ./ src/java/org/apache/nutch/tools/ src/java/org/apache/nutch/util/

Author: mattmann
Date: Thu Mar 26 02:56:20 2015
New Revision: 1669248

URL: http://svn.apache.org/r1669248
Log:
fix for NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann).

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
    nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Mar 26 02:56:20 2015
@@ -2,6 +2,8 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1974 keyPrefix option for CommonCrawlDataDumper tool (Giuseppe Totaro via mattmann)
+
 * NUTCH-1968 File Name too long issue of DumpFileUtil.java file (Xin Zhang, Renxia Wang via mattmann)
 
 * NUTCH-1966 Configuration endpoint for 1x REST API (Sujen Shah via mattmann)

Modified: nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java Thu Mar 26 02:56:20 2015
@@ -23,12 +23,17 @@ import java.net.UnknownHostException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Abstract class that implements {@see CommonCrawlFormat} interface. 
  *
  */
 public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+	private static final Logger LOG = LoggerFactory.getLogger(AbstractCommonCrawlFormat.class.getName());
+	
 	protected String url;
 	
 	protected byte[] content;
@@ -37,32 +42,117 @@ public abstract class AbstractCommonCraw
 	
 	protected Configuration conf;
 	
-	public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf) {
+	protected String keyPrefix;
+	
+	public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
 		this.url = url;
 		this.content = content;
 		this.metadata = metadata;
 		this.conf = conf;
+		this.keyPrefix = keyPrefix;
 	}
-
+	
 	@Override
-	public String getJsonData(boolean mapAll) throws IOException {
-		if (mapAll) {
-			return getJsonDataAll();
-		}
-		else {
-			return getJsonDataSet();
+	public String getJsonData() throws IOException {
+		try {
+			startObject(null);
+			
+			// url
+			writeKeyValue("url", getUrl());
+			
+			// timestamp
+			writeKeyValue("timestamp", getTimestamp());
+			
+			// request
+			startObject("request");
+			writeKeyValue("method", getMethod());
+			startObject("client");
+			writeKeyValue("hostname", getRequestHostName());
+			writeKeyValue("address", getRequestHostAddress());
+			writeKeyValue("software", getRequestSoftware());
+			writeKeyValue("robots", getRequestRobots());
+			startObject("contact");
+			writeKeyValue("name", getRequestContactName());
+			writeKeyValue("email", getRequestContactEmail());
+			closeObject("contact");
+			closeObject("client");
+			startObject("headers");
+			writeKeyValue("Accept", getRequestAccept());
+			writeKeyValue("Accept-Encoding", getRequestAcceptEncoding());
+			writeKeyValue("Accept-Language", getRequestAcceptLanguage());
+			writeKeyValue("User-Agent", getRequestUserAgent());
+			closeObject("headers");
+			writeKeyNull("body");
+			closeObject("request");
+			
+			// response
+			startObject("response");
+			writeKeyValue("status", getResponseStatus());
+			startObject("server");
+			writeKeyValue("hostname", getResponseHostName());
+			writeKeyValue("address", getResponseAddress());
+			closeObject("server");
+			startObject("headers");
+			writeKeyValue("Content-Encoding", getResponseContentEncoding());
+			writeKeyValue("Content-Type", getResponseContentType());
+			writeKeyValue("Date", getResponseDate());
+			writeKeyValue("Server", getResponseServer());
+			for (String name : metadata.names()) {
+				if (name.equalsIgnoreCase("Content-Encoding") || name.equalsIgnoreCase("Content-Type") || name.equalsIgnoreCase("Date") || name.equalsIgnoreCase("Server")) {
+					continue;
+				}
+				writeKeyValue(name, metadata.get(name));
+			}
+			closeObject("headers");
+			writeKeyValue("body", getResponseContent());
+			closeObject("response");
+			
+			// key
+			if (!this.keyPrefix.isEmpty()) {
+				this.keyPrefix += "-";
+			}
+			writeKeyValue("key", this.keyPrefix + getKey());
+			
+			// imported
+			writeKeyValue("imported", getImported());
+			
+			closeObject(null);
+			
+			return generateJson();
+		
+		} catch (IOException ioe) {
+			LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+			throw new IOException("Error in generating JSON:" + ioe.getMessage()); 
 		}
 	}
 	
-	protected abstract String getJsonDataSet() throws IOException;
+	// abstract methods
 	
-	protected abstract String getJsonDataAll() throws IOException;
+	protected abstract void writeKeyValue(String key, String value) throws IOException;
 	
-	protected String ifNullString(String value) {
-		return (value != null) ? value : "";
+	protected abstract void writeKeyNull(String key) throws IOException;
+	
+	protected abstract void startObject(String key) throws IOException;
+	
+	protected abstract void closeObject(String key) throws IOException;
+	
+	protected abstract String generateJson() throws IOException;
+	
+	// getters
+	
+	protected String getUrl() {
+		return url;
+	}
+	
+	protected String getTimestamp() {
+		return metadata.get(ifNullString(Metadata.LAST_MODIFIED));
+	}
+	
+	protected String getMethod() {
+		return new String("GET");
 	}
 	
-	protected static String getHostName() {
+	protected String getRequestHostName() {
 		String hostName = "";
 		try {
 			hostName = InetAddress.getLocalHost().getHostName();
@@ -72,7 +162,7 @@ public abstract class AbstractCommonCraw
 		return hostName;
 	}
 	
-	protected static String getHostAddress() {
+	protected String getRequestHostAddress() {
 		String hostAddress = "";
 		try {
 			hostAddress = InetAddress.getLocalHost().getHostAddress();
@@ -81,4 +171,80 @@ public abstract class AbstractCommonCraw
 		}
 		return hostAddress;
 	}
+	
+	protected String getRequestSoftware() {
+		return conf.get("http.agent.version", "");
+	}
+	
+	protected String getRequestRobots() {
+		return new String("CLASSIC");
+	}
+	
+	protected String getRequestContactName() {
+		return conf.get("http.agent.name", "");
+	}
+	
+	protected String getRequestContactEmail() {
+		return conf.get("http.agent.email", "");
+	}
+	
+	protected String getRequestAccept() {
+		return conf.get("http.accept", "");
+	}
+	
+	protected String getRequestAcceptEncoding() {
+		return new String(""); // TODO
+	}
+	
+	protected String getRequestAcceptLanguage() {
+		return conf.get("http.accept.language", "");
+	}
+	
+	protected String getRequestUserAgent() {
+		return conf.get("http.robots.agents", "");
+	}
+	
+	protected String getResponseStatus() {
+		return ifNullString(metadata.get("status"));
+	}
+	
+	protected String getResponseHostName() {
+		return URLUtil.getHost(url);
+	}
+	
+	protected String getResponseAddress() {
+		return ifNullString(metadata.get("_ip_"));
+	}
+	
+	protected String getResponseContentEncoding() {
+		return ifNullString(metadata.get("Content-Encoding"));
+	}
+	
+	protected String getResponseContentType() {
+		return ifNullString(metadata.get("Content-Type"));
+	}
+	
+	protected String getResponseDate() {
+		return ifNullString(metadata.get("Date"));
+	}
+	
+	protected String getResponseServer() {
+		return ifNullString(metadata.get("Server"));
+	}
+	
+	protected String getResponseContent() {
+		return new String(content);
+	}
+	
+	protected String getKey() {
+		return url;
+	}
+	
+	protected String getImported() {
+		return new String(""); // TODO
+	}
+	
+	private static String ifNullString(String value) {
+		return (value != null) ? value : "";
+	}
 }

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Thu Mar 26 02:56:20 2015
@@ -30,7 +30,6 @@ import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Map;
-import java.security.MessageDigest;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -54,12 +53,15 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DumpFileUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 //Tika imports
 import org.apache.tika.Tika;
+
 import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
 import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -162,6 +164,13 @@ import com.ibm.icu.text.SimpleDateFormat
 public class CommonCrawlDataDumper {
 
 	private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+	
+	// Output streams and entry list used when writing the gzipped tar archive
+	private FileOutputStream fileOutput = null;
+	private BufferedOutputStream bufOutput = null;
+	private GzipCompressorOutputStream gzipOutput = null;
+	private TarArchiveOutputStream tarOutput = null;
+	private ArrayList<String> fileList = null;
 
 	/**
 	 * Main method for invoking this tool
@@ -177,17 +186,20 @@ public class CommonCrawlDataDumper {
 	@SuppressWarnings("static-access")
 	public static void main(String[] args) throws Exception {
 		Option helpOpt = new Option("h", "help", false,
-				"show this help message");
+				"show this help message.");
 		// argument options
 		Option outputOpt = OptionBuilder
 				.withArgName("outputDir")
 				.hasArg()
 				.withDescription(
-						"output directory (which will be created) to host the CBOR data")
+						"output directory (which will be created) to host the CBOR data.")
 				.create("outputDir");
-		Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
-				.withDescription("the segment(s) to use").create("segment");
-		// GIUSEPPE: create mimetype and gzip options
+		Option segOpt = OptionBuilder
+				.withArgName("segment")
+				.hasArgs()
+				.withDescription("the segment(s) to use")
+				.create("segment");
+		// create mimetype, gzip and keyPrefix options
 		Option mimeOpt = OptionBuilder
 				.isRequired(false)
 				.withArgName("mimetype")
@@ -196,11 +208,16 @@ public class CommonCrawlDataDumper {
 						"an optional list of mimetypes to dump, excluding all others. Defaults to all.")
 				.create("mimetype");
 		Option gzipOpt = OptionBuilder
-				.isRequired(false)
+				.withArgName("gzip")
 				.hasArg(false)
 				.withDescription(
-						"an optional flag indicating whether to additionally gzip the data")
+						"an optional flag indicating whether to additionally gzip the data.")
 				.create("gzip");
+		Option keyPrefixOpt = OptionBuilder
+				.withArgName("keyPrefix")
+				.hasArg(true)
+				.withDescription("an optional prefix for key in the output format.")
+				.create("keyPrefix");
 
 		// create the options
 		Options options = new Options();
@@ -210,6 +227,8 @@ public class CommonCrawlDataDumper {
 		// create mimetypes and gzip options
 		options.addOption(mimeOpt);
 		options.addOption(gzipOpt);
+		// register the keyPrefix option
+		options.addOption(keyPrefixOpt);
 
 		CommandLineParser parser = new GnuParser();
 		try {
@@ -224,6 +243,7 @@ public class CommonCrawlDataDumper {
 			File segmentRootDir = new File(line.getOptionValue("segment"));
 			String[] mimeTypes = line.getOptionValues("mimetype");
 			boolean gzip = line.hasOption("gzip");
+			String keyPrefix = line.getOptionValue("keyPrefix", "");
 
 			if (!outputDir.exists()) {
 				LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
@@ -233,7 +253,7 @@ public class CommonCrawlDataDumper {
 
 			CommonCrawlDataDumper dumper = new CommonCrawlDataDumper();
 			
-			dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes);
+			dumper.dump(outputDir, segmentRootDir, gzip, mimeTypes, keyPrefix);
 			
 		} catch (Exception e) {
 			LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils.stringifyException(e));
@@ -261,7 +281,7 @@ public class CommonCrawlDataDumper {
      *            filtered out.
 	 * @throws Exception
 	 */
-	public void dump(File outputDir, File segmentRootDir, boolean gzip,	String[] mimeTypes) throws Exception {
+	public void dump(File outputDir, File segmentRootDir, boolean gzip,	String[] mimeTypes, String keyPrefix) throws Exception {
 		if (!gzip) {
 			LOG.info("Gzipping CBOR data has been skipped");
 		}
@@ -284,22 +304,9 @@ public class CommonCrawlDataDumper {
 			System.exit(1);
 		}
 		
-		// Gzip initialization
-		FileOutputStream fileOutput = null;
-	    BufferedOutputStream bufOutput = null;
-	    GzipCompressorOutputStream gzipOutput = null;
-	    TarArchiveOutputStream tarOutput = null;
-	    
-	    ArrayList<String> fileList = null;
-	    
 		if (gzip) {
-			String archiveName = new SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
-		    fileOutput = new FileOutputStream(new File(outputDir + File.separator + archiveName));
-		    bufOutput = new BufferedOutputStream(fileOutput);
-		    gzipOutput = new GzipCompressorOutputStream(bufOutput);
-		    tarOutput = new TarArchiveOutputStream(gzipOutput);
-		    
-		    fileList = new ArrayList<String>();
+			fileList = new ArrayList<String>();
+		    constructNewStream(outputDir);
 		}
 
 		for (File segment : segmentDirs) {
@@ -334,7 +341,14 @@ public class CommonCrawlDataDumper {
 						extension = "html";
 					}
 
-					String filename = baseName + "." + extension;
+					String md5Ofurl = DumpFileUtil.getUrlMD5(url);
+					String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl, !gzip);
+					String filename = DumpFileUtil.createFileName(md5Ofurl, baseName, extension);
+					String outputFullPath = String.format("%s/%s", fullDir, filename);
+
+					String [] fullPathLevels = fullDir.split(File.separator);
+					String firstLevelDirName = fullPathLevels[fullPathLevels.length-2]; 
+					String secondLevelDirName = fullPathLevels[fullPathLevels.length-1];
 					
 					// Encode all filetypes if no mimetypes have been given
 					Boolean filter = (mimeTypes == null);
@@ -343,8 +357,8 @@ public class CommonCrawlDataDumper {
 					try {
 						String mimeType = new Tika().detect(content.getContent());
 						// Maps file to JSON-based structure
-						CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, content.getContent(), content.getMetadata(), conf);
-						jsonData = format.getJsonData(false);
+						CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, content.getContent(), content.getMetadata(), conf, keyPrefix);
+						jsonData = format.getJsonData();
 
 						collectStats(typeCounts, mimeType);
 						// collects statistics for the given mimetypes
@@ -352,53 +366,36 @@ public class CommonCrawlDataDumper {
 							collectStats(filteredCounts, mimeType);
 							filter = true;
 						}
-					} catch (Exception e) {
-						e.printStackTrace();
-						LOG.warn("Tika is unable to detect type for: [" + url
-								+ "]");
+					} catch (IOException ioe) { 
+						LOG.error("Fatal error in creating JSON data: " + ioe.getMessage());
+						return;
 					}
 
 					if (filter) {
-						
 						byte[] byteData = serializeCBORData(jsonData);
 						
 						if (!gzip) {
-							String outputFullPath = outputDir + File.separator + filename;
+							//String outputFullPath = outputDir + File.separator + filename;
 							File outputFile = new File(outputFullPath);
 							if (outputFile.exists()) {
 								LOG.info("Skipping writing: [" + outputFullPath	+ "]: file already exists");
 							}
 							else {
 								LOG.info("Writing: [" + outputFullPath + "]");
-								try{
-								    IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
-								}
-								catch (Exception e){
-								    MessageDigest md = MessageDigest.getInstance("MD5");
-								    md.update(outputFullPath.getBytes());
-								    byte[] digest = md.digest();
-								    StringBuffer sb = new StringBuffer();
-								    for (byte b : digest) {
-									   sb.append(String.format("%02x", b & 0xff));
-								    }
-								    outputFullPath = outputFullPath.substring(0, 32) + "_" + sb.toString();
-								    File newOutPutFile = new File(outputFullPath);
-								    IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(newOutPutFile));
-								    LOG.info("File name is too long. Truncated and MD5 appended.");
-								}
+								IOUtils.copy(new ByteArrayInputStream(byteData), new FileOutputStream(outputFile));
 							}
 						}
 						else {
-							if (fileList.contains(filename)) {
-								LOG.info("Skipping compressing: [" + filename	+ "]: file already exists");
+							if (fileList.contains(outputFullPath)) {
+								LOG.info("Skipping compressing: [" + outputFullPath + "]: file already exists");
 							}
 							else {
-								fileList.add(filename);
-								LOG.info("Compressing: [" + filename + "]");
-								TarArchiveEntry tarEntry = new TarArchiveEntry(filename);
+								fileList.add(outputFullPath);
+								LOG.info("Compressing: [" + outputFullPath + "]");
+								TarArchiveEntry tarEntry = new TarArchiveEntry(firstLevelDirName + File.separator + secondLevelDirName + File.separator + filename);
 								tarEntry.setSize(byteData.length);
 								tarOutput.putArchiveEntry(tarEntry);
-								IOUtils.copy(new ByteArrayInputStream(byteData), tarOutput);
+								tarOutput.write(byteData);
 								tarOutput.closeArchiveEntry();
 							}
 						}
@@ -411,15 +408,35 @@ public class CommonCrawlDataDumper {
 		}
 		
 		if (gzip) {
+	        closeStream();
+		}
+		
+		if (!typeCounts.isEmpty()) {
+			LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+		}
+	}
+	
+	private void closeStream() {
+		try {
 			tarOutput.finish();
-			 
+			
 	        tarOutput.close();
 	        gzipOutput.close();
 	        bufOutput.close();
 	        fileOutput.close();
+		} catch (IOException ioe) {
+			LOG.warn("Error in closing stream: " + ioe.getMessage());
 		}
-		
-		LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+	}
+	
+	private void constructNewStream(File outputDir) throws IOException {	
+		String archiveName = new SimpleDateFormat("yyyyMMddhhmm'.tar.gz'").format(new Date());
+		LOG.info("Creating a new gzip archive: " + archiveName);
+	    fileOutput = new FileOutputStream(new File(outputDir + File.separator + archiveName));
+	    bufOutput = new BufferedOutputStream(fileOutput);
+	    gzipOutput = new GzipCompressorOutputStream(bufOutput);
+	    tarOutput = new TarArchiveOutputStream(gzipOutput);
+	    tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
 	}
 	
 	private byte[] serializeCBORData(String jsonData) {
@@ -458,8 +475,8 @@ public class CommonCrawlDataDumper {
 	private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
 		StringBuilder builder = new StringBuilder();
 		// print total stats
-		builder.append("\n  TOTAL Stats:\n");
-		builder.append("                {\n");
+		builder.append("\nTOTAL Stats:\n");
+		builder.append("{\n");
 		for (String mimeType : typeCounts.keySet()) {
 			builder.append("    {\"mimeType\":\"");
 			builder.append(mimeType);
@@ -470,8 +487,8 @@ public class CommonCrawlDataDumper {
 		builder.append("}\n");
 		// filtered types stats
 		if (!filteredCounts.isEmpty()) {
-			builder.append("\n  FILTERED Stats:\n");
-			builder.append("                {\n");
+			builder.append("\nFILTERED Stats:\n");
+			builder.append("{\n");
 			for (String mimeType : filteredCounts.keySet()) {
 				builder.append("    {\"mimeType\":\"");
 				builder.append(mimeType);

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Thu Mar 26 02:56:20 2015
@@ -33,5 +33,6 @@ public interface CommonCrawlFormat {
 	 * @param mapAll If {@code true} maps all metdata on the JSON structure.
 	 * @return the JSON data
 	 */
-	public String getJsonData(boolean mapAll) throws IOException;
+	// Replaces the former getJsonData(boolean mapAll): all metadata entries are now always mapped.
+	public String getJsonData() throws IOException;
 }

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java Thu Mar 26 02:56:20 2015
@@ -17,6 +17,8 @@
 
 package org.apache.nutch.tools;
 
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 
@@ -34,21 +36,22 @@ public class CommonCrawlFormatFactory {
 	 * @param metadata the metadata.
 	 * @param conf the configuration.
 	 * @return the new {@see CommonCrawlFormat} object.
+	 * @throws IOException If any I/O error occurs.
 	 */
 	public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, byte[] content,
-			Metadata metadata, Configuration conf) {
+			Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
 		if (formatType == null) {
 			return null;
 		}
 		
 		if (formatType.equalsIgnoreCase("jackson")) {
-			return new CommonCrawlFormatJackson(url, content, metadata, conf);
+			return new CommonCrawlFormatJackson(url, content, metadata, conf, keyPrefix);
 		}
 		else if (formatType.equalsIgnoreCase("jettinson")) {
-			return new CommonCrawlFormatJettinson(url, content, metadata, conf);
+			return new CommonCrawlFormatJettinson(url, content, metadata, conf, keyPrefix);
 		}
 		else if (formatType.equalsIgnoreCase("simple")) {
-			return new CommonCrawlFormatSimple(url, content, metadata, conf);
+			return new CommonCrawlFormatSimple(url, content, metadata, conf, keyPrefix);
 		}
 		
 		return null;

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java Thu Mar 26 02:56:20 2015
@@ -22,9 +22,6 @@ import java.io.IOException;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import com.fasterxml.jackson.core.JsonFactory;
 import com.fasterxml.jackson.core.JsonGenerator;
@@ -34,220 +31,52 @@ import com.fasterxml.jackson.core.JsonGe
  *
  */
 public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
-
-  private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
-
-  public CommonCrawlFormatJackson(String url, byte[] content,
-      Metadata metadata, Configuration conf) {
-    super(url, content, metadata, conf);
-  }
-
-  @Override
-  protected String getJsonDataAll() throws IOException {
-    JsonFactory factory = new JsonFactory();
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    JsonGenerator generator = null;
-
-    try {
-      generator = factory.createGenerator(out);
-      generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
-
-      generator.writeStartObject();
-
-      // url
-      generator.writeFieldName("url");
-      generator.writeString(url);
-
-      // timestamp
-      generator.writeFieldName("timestamp");
-      generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
-
-
-      //request
-      generator.writeFieldName("request");
-      generator.writeStartObject();
-      generator.writeFieldName("method");
-      generator.writeString("GET"); 
-      generator.writeFieldName("client");
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(getHostName());
-      generator.writeFieldName("address");
-      generator.writeString(getHostAddress());
-      generator.writeFieldName("software");
-      generator.writeString(conf.get("http.agent.version", ""));
-      generator.writeFieldName("robots");
-      generator.writeString("classic");
-      generator.writeFieldName("contact");
-      generator.writeStartObject();
-      generator.writeFieldName("name");
-      generator.writeString(conf.get("http.agent.name", ""));
-      generator.writeFieldName("email");
-      generator.writeString(conf.get("http.agent.email", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      generator.writeFieldName("Accept");
-      generator.writeString(conf.get("accept", ""));
-      generator.writeFieldName("Accept-Encoding");
-      generator.writeString(""); // TODO
-      generator.writeFieldName("Accept-Language");
-      generator.writeString(conf.get("http.accept.language", ""));
-      generator.writeFieldName("User-Agent");
-      generator.writeString(conf.get("http.robots.agents", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("body");
-      generator.writeNull();
-      generator.writeEndObject();
-
-      //response
-      generator.writeFieldName("response");
-      generator.writeStartObject();
-      generator.writeFieldName("status");
-      generator.writeString(ifNullString(metadata.get("status")));
-      generator.writeFieldName("server");
-
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(URLUtil.getHost(url)); 
-      generator.writeFieldName("address");
-      generator.writeString(ifNullString(metadata.get("_ip_")));
-      generator.writeEndObject();
-
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      for (String name : metadata.names()) {
-        generator.writeFieldName(name);
-        generator.writeString(ifNullString(metadata.get(name)));
-      }
-      generator.writeEndObject();
-
-      generator.writeFieldName("body");
-      generator.writeString(new String(content));
-      generator.writeEndObject();
-
-      generator.writeFieldName("key"); 
-      generator.writeString(url);
-
-      generator.writeFieldName("imported"); // TODO
-      generator.writeString("");
-
-      generator.writeEndObject();
-
-      generator.flush();
-
-      return out.toString();
-
-    } catch (IOException ioe) {
-      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
-      throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage()); 
-    }
-  }
-
-  @Override
-  protected String getJsonDataSet() throws IOException {
-    JsonFactory factory = new JsonFactory();
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    JsonGenerator generator = null;
-
-    try {
-      generator = factory.createGenerator(out);
-      generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
-
-      generator.writeStartObject();
-
-      // url
-      generator.writeFieldName("url");
-      generator.writeString(url);
-
-      // timestamp
-      generator.writeFieldName("timestamp");
-      generator.writeString(metadata.get(Metadata.LAST_MODIFIED)); 
-
-      //request
-      generator.writeFieldName("request");
-      generator.writeStartObject();
-      generator.writeFieldName("method");
-      generator.writeString("GET");
-      generator.writeFieldName("client");
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(getHostName());
-      generator.writeFieldName("address");
-      generator.writeString(getHostAddress());
-      generator.writeFieldName("software");
-      generator.writeString(conf.get("http.agent.version", ""));
-      generator.writeFieldName("robots");
-      generator.writeString("CLASSIC"); 
-      generator.writeFieldName("contact");
-      generator.writeStartObject();
-      generator.writeFieldName("name");
-      generator.writeString(conf.get("http.agent.name", ""));
-      generator.writeFieldName("email");
-      generator.writeString(conf.get("http.agent.email", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      generator.writeFieldName("Accept");
-      generator.writeString(conf.get("accept", ""));
-      generator.writeFieldName("Accept-Encoding");
-      generator.writeString(""); // TODO
-      generator.writeFieldName("Accept-Language");
-      generator.writeString(conf.get("http.accept.language", ""));
-      generator.writeFieldName("User-Agent");
-      generator.writeString(conf.get("http.robots.agents", ""));
-      generator.writeEndObject();
-      generator.writeFieldName("body");
-      generator.writeNull();
-      generator.writeEndObject();
-
-      //response
-      generator.writeFieldName("response");
-      generator.writeStartObject();
-      generator.writeFieldName("status");
-      generator.writeString(ifNullString(metadata.get("status")));
-      generator.writeFieldName("server");
-
-      generator.writeStartObject();
-      generator.writeFieldName("hostname");
-      generator.writeString(URLUtil.getHost(url)); 
-      generator.writeFieldName("address");
-      generator.writeString(ifNullString(metadata.get("_ip_")));
-      generator.writeEndObject();
-
-      generator.writeFieldName("headers");
-      generator.writeStartObject();
-      generator.writeFieldName("Content-Encoding");
-      generator.writeString(ifNullString(metadata.get("Content-Encoding")));
-      generator.writeFieldName("Content-Type");
-      generator.writeString(ifNullString(metadata.get("Content-Type")));
-      generator.writeFieldName("Date");
-      generator.writeString(ifNullString(metadata.get("Date")));
-      generator.writeFieldName("Server");
-      generator.writeString(ifNullString(metadata.get("Server")));
-      generator.writeEndObject();
-
-      generator.writeFieldName("body");
-      generator.writeString(new String(content));
-      generator.writeEndObject();
-
-      generator.writeFieldName("key");
-      generator.writeString(url);
-
-      generator.writeFieldName("imported"); // TODO
-      generator.writeString("");
-
-      generator.writeEndObject();
-
-      generator.flush();
-
-      return out.toString();
-
-    } catch (IOException ioe) {
-      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
-      throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage()); 
-    }
-  }
+	
+	//private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+	
+	private ByteArrayOutputStream out;
+	
+	private JsonGenerator generator;
+
+	public CommonCrawlFormatJackson(String url, byte[] content,
+			Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
+		super(url, content, metadata, conf, keyPrefix);
+		
+		JsonFactory factory = new JsonFactory();
+		this.out = new ByteArrayOutputStream();
+		this.generator = factory.createGenerator(out);
+		
+		this.generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+	}
+	
+	@Override
+	protected void writeKeyValue(String key, String value) throws IOException {
+		generator.writeFieldName(key);
+		generator.writeString(value);
+	}
+	
+	@Override
+	protected void writeKeyNull(String key) throws IOException {
+		generator.writeFieldName(key);
+		generator.writeNull();;
+	}
+	
+	@Override
+	protected void startObject(String key) throws IOException {
+		if (key != null) {
+			generator.writeFieldName(key);
+		}
+		generator.writeStartObject();
+	}
+	
+	@Override
+	protected void closeObject(String key) throws IOException {
+		generator.writeEndObject();
+	}
+	
+	@Override
+	protected String generateJson() throws IOException {
+		this.generator.flush();
+		return this.out.toString();
+	}
 }

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java Thu Mar 26 02:56:20 2015
@@ -18,15 +18,13 @@
 package org.apache.nutch.tools;
 
 import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
 import org.codehaus.jettison.json.JSONException;
 import org.codehaus.jettison.json.JSONObject;
-import org.mortbay.log.Log;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * This class provides methods to map crawled data on JSON using Jettinson APIs. 
@@ -34,135 +32,57 @@ import org.slf4j.LoggerFactory;
  */
 public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
 	
-	private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+	private Deque<JSONObject> stack;
 
 	public CommonCrawlFormatJettinson(String url, byte[] content,
-			Metadata metadata, Configuration conf) {
-		super(url, content, metadata, conf);
+			Metadata metadata, Configuration conf, String keyPrefix) throws IOException {
+		super(url, content, metadata, conf, keyPrefix);
+		
+		stack = new ArrayDeque<JSONObject>();
 	}
 	
 	@Override
-	protected String getJsonDataAll() throws IOException {
+	protected void writeKeyValue(String key, String value) throws IOException {
+		try {
+			stack.getFirst().put(key, value);
+		} catch (JSONException jsone) {
+			throw new IOException(jsone.getMessage());
+		}
+	}
+	
+	@Override
+	protected void writeKeyNull(String key) throws IOException {
+		try {
+			stack.getFirst().put(key, JSONObject.NULL);
+		} catch (JSONException jsone) {
+			throw new IOException(jsone.getMessage());
+		}
+	}
+	
+	@Override
+	protected void startObject(String key) throws IOException {
 		JSONObject object = new JSONObject();
-
+		stack.push(object);
+	}
+	
+	@Override
+	protected void closeObject(String key) throws IOException {
 		try {
-			// url
-			object.put("url", url);
-
-			// timestamp
-			object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
-
-			// request
-			JSONObject requestObject = new JSONObject();
-			requestObject.put("method", "GET"); 
-			JSONObject clientObject = new JSONObject();
-			clientObject.put("hostname", getHostName());
-			clientObject.put("address", getHostAddress());
-			clientObject.put("software", conf.get("http.agent.version", ""));
-			clientObject.put("robots", "CLASSIC");
-			JSONObject contactObject = new JSONObject();
-			contactObject.put("name", conf.get("http.agent.name", ""));
-			contactObject.put("email", conf.get("http.agent.email", ""));
-			clientObject.put("contact", contactObject);
-			requestObject.put("client", clientObject);
-			JSONObject reqHeadersObject = new JSONObject();
-			reqHeadersObject.put("Accept", conf.get("http.accept", ""));
-			reqHeadersObject.put("Accept-Encoding", ""); // TODO
-			reqHeadersObject.put("Accept-Language",	conf.get("http.accept.language", ""));
-			reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
-			requestObject.put("headers", reqHeadersObject);
-			requestObject.put("body", JSONObject.NULL);
-			object.put("request", requestObject);
-
-			// response
-			JSONObject responseObject = new JSONObject();
-			responseObject.put("status", ifNullString(metadata.get("status")));
-			JSONObject serverObject = new JSONObject();
-			serverObject.put("hostname", URLUtil.getHost(url));
-			serverObject.put("address", ifNullString(metadata.get("_ip_")));
-			responseObject.put("client", serverObject);
-			JSONObject respHeadersObject = new JSONObject();
-			for (String name : metadata.names()) {
-				respHeadersObject.put(name, ifNullString(metadata.get(name)));
+			if (stack.size() > 1) {
+				JSONObject object = stack.pop();
+				stack.getFirst().put(key, object);
 			}
-			responseObject.put("headers", respHeadersObject);
-			responseObject.put("body", new String(content));
-			object.put("response", responseObject);
-
-			// key
-			object.put("key", url); 
-
-			// imported
-			object.put("imported", ""); // TODO
-
-			return object.toString(2); // INDENTED OUTPUT
-
 		} catch (JSONException jsone) {
-			LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
-			throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage()); 
+			throw new IOException(jsone.getMessage());
 		}
 	}
-
+	
 	@Override
-	protected String getJsonDataSet() throws IOException {
-		JSONObject object = new JSONObject();
-
+	protected String generateJson() throws IOException {
 		try {
-			// url
-			object.put("url", url);
-
-			// timestamp
-			object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
-
-			// request
-			JSONObject requestObject = new JSONObject();
-			requestObject.put("method", "GET"); 
-			JSONObject clientObject = new JSONObject();
-			clientObject.put("hostname", getHostName());
-			clientObject.put("address", getHostAddress());
-			clientObject.put("software", conf.get("http.agent.version", ""));
-			clientObject.put("robots", "CLASSIC"); 
-			JSONObject contactObject = new JSONObject();
-			contactObject.put("name", conf.get("http.agent.name", ""));
-			contactObject.put("email", conf.get("http.agent.email", ""));
-			clientObject.put("contact", contactObject);
-			requestObject.put("client", clientObject);
-			JSONObject reqHeadersObject = new JSONObject();
-			reqHeadersObject.put("Accept", conf.get("http.accept", ""));
-			reqHeadersObject.put("Accept-Encoding", ""); // TODO
-			reqHeadersObject.put("Accept-Language",	conf.get("http.accept.language", ""));
-			reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", "")); 
-			requestObject.put("headers", reqHeadersObject);
-			requestObject.put("body", JSONObject.NULL);
-			object.put("request", requestObject);
-
-			// response
-			JSONObject responseObject = new JSONObject();
-			responseObject.put("status", ifNullString(metadata.get("status")));
-			JSONObject serverObject = new JSONObject();
-			serverObject.put("hostname", URLUtil.getHost(url)); 
-			serverObject.put("address", ifNullString(metadata.get("_ip_")));
-			responseObject.put("client", serverObject);
-			JSONObject respHeadersObject = new JSONObject();
-			respHeadersObject.put("Content-Encoding", ifNullString(metadata.get("Content-Encoding")));
-			respHeadersObject.put("Content-Type", ifNullString(metadata.get("Content-Type")));
-			respHeadersObject.put("Date", ifNullString(metadata.get("Date")));
-			respHeadersObject.put("Server", ifNullString(metadata.get("Server")));
-			responseObject.put("headers", respHeadersObject);
-			responseObject.put("body", new String(content)); 
-			object.put("response", responseObject);
-
-			// key
-			object.put("key", url);
-
-			// imported
-			object.put("imported", ""); // TODO
-
-			return object.toString(2); // INDENTED OUTPUT
-
+			return stack.getFirst().toString(2);
 		} catch (JSONException jsone) {
-			LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
-			throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage()); 
+			throw new IOException(jsone.getMessage());
 		}
 	}
 }

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java Thu Mar 26 02:56:20 2015
@@ -17,9 +17,10 @@
 
 package org.apache.nutch.tools;
 
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.util.URLUtil;
 
 /**
  * This class provides methods to map crawled data on JSON using a {@see StringBuilder} object. 
@@ -27,126 +28,113 @@ import org.apache.nutch.util.URLUtil;
  */
 public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
 	
+	private StringBuilder sb;
+	
+	private int tabCount;
+	
 	public CommonCrawlFormatSimple(String url, byte[] content, Metadata metadata,
-			Configuration conf) {
-		super(url, content, metadata, conf);
+			Configuration conf, String keyPrefix) throws IOException {
+		super(url, content, metadata, conf, keyPrefix);
+		
+		this.sb = new StringBuilder();
+		this.tabCount = 0;
 	}
 	
-	@Override
-	protected String getJsonDataAll() {
-		// TODO character escaping
-		StringBuilder sb = new StringBuilder();
-		sb.append("{\n");
-
-		// url
-		sb.append("\t\"url\": \"" + url + "\",\n");
-		
-		// timstamp
-		sb.append("\t\"timstamp\": \"" + metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
-				
-		// request
-		sb.append("\t\"request\": {\n");
-		sb.append("\t\t\"method\": \"GET\",\n");
-		sb.append("\t\t\"client\": {\n");
-		sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
-		sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
-		sb.append("\t\t\t\"software\": \"" + conf.get("http.agent.version", "") + "\",\n");
-		sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
-		sb.append("\t\t\t\"contact\": {\n");
-		sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name", "") + "\",\n");
-		sb.append("\t\t\t\t\"email\": \"" + conf.get("http.agent.email", "") + "\",\n");
-		sb.append("\t\t\t}\n");
-		sb.append("\t\t},\n");
-		sb.append("\t\t\"headers\": {\n");
-		sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "") + "\",\n");
-		sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); //TODO
-		sb.append("\t\t\t\"Accept-Language\": \"" + conf.get("http.accept.language", "") + "\",\n");
-		sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "") + "\",\n");  
-		sb.append("\t},\n");
-
-		// response
-		sb.append("\t\"response\": {\n");
-		sb.append("\t\t\"status\": \"" + ifNullString(metadata.get("status")) + "\",\n");
-		sb.append("\t\t\"server\": {\n");
-		sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n"); 
-		sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") + "\"\n");
-		sb.append("\t\t},\n");
-		sb.append("\t\t\"headers\": {\n");	
-		for (String name : metadata.names()) {
-			sb.append("\t\t\t\"" + name + "\": \"" + metadata.get(name)	+ "\"\n");
+	protected void writeKeyValue(String key, String value) throws IOException {
+		sb.append(printTabs() + "\"" + key + "\": " + quote(value) + ",\n");
+	}
+	
+	protected void writeKeyNull(String key) throws IOException {
+		sb.append(printTabs() + "\"" + key + "\": null,\n");
+	}
+	
+	protected void startObject(String key) throws IOException {
+		String name = "";
+		if (key != null) {
+			name = "\"" + key + "\": ";
 		}
-		sb.append("\t\t},\n");
-		sb.append("\t\t\"body\": " + new String(content) + "\",\n");
-		sb.append("\t},\n");
-		
-		// key
-		sb.append("\t\"key\": \"" + url + "\",\n");
-		
-		// imported
-		sb.append("\t\"imported\": \"\"\n"); //TODO
-		
-		sb.append("}");
-
+		sb.append(printTabs() + name + "{\n");
+		this.tabCount++;
+	}
+	
+	protected void closeObject(String key) throws IOException {
+		sb.deleteCharAt(sb.length()-2); // delete comma
+		this.tabCount--;
+		sb.append(printTabs() + "},\n");
+	}
+	
+	protected String generateJson() throws IOException {
+		sb.deleteCharAt(sb.length()-1); // delete new line
+		sb.deleteCharAt(sb.length()-1); // delete comma
 		return sb.toString();
 	}
 	
-	@Override
-	protected String getJsonDataSet() {
-		// TODO character escaping
+	private String printTabs() {
 		StringBuilder sb = new StringBuilder();
-		sb.append("{\n");
-		
-		// url
-		sb.append("\t\"url\": \"" + url + "\",\n");
-		
-		// timstamp
-		sb.append("\t\"timestamp\": \"" + metadata.get(Metadata.LAST_MODIFIED) + "\",\n");
-		
-		// request
-		sb.append("\t\"request\": {\n");
-		sb.append("\t\t\"method\": \"GET\",\n");
-		sb.append("\t\t\"client\": {\n");
-		sb.append("\t\t\t\"hostname\": \"" + getHostName() + "\",\n");
-		sb.append("\t\t\t\"address\": \"" + getHostAddress() + "\",\n");
-		sb.append("\t\t\t\"software\": \"" + conf.get("http.agent.version", "") + "\",\n");
-		sb.append("\t\t\t\"robots\": \"CLASSIC\",\n");
-		sb.append("\t\t\t\"contact\": {\n");
-		sb.append("\t\t\t\t\"name\": \"" + conf.get("http.agent.name", "") + "\",\n");
-		sb.append("\t\t\t\t\"email\": \"" + conf.get("http.agent.email", "") + "\",\n");
-		sb.append("\t\t\t}\n");
-		sb.append("\t\t},\n");
-		sb.append("\t\t\"headers\": {\n");
-		sb.append("\t\t\t\"Accept\": \"" + conf.get("http.accept", "") + "\",\n");
-		sb.append("\t\t\t\"Accept-Encoding\": \"\",\n"); // TODO
-		sb.append("\t\t\t\"Accept-Language\": \"" + conf.get("http.accept.language", "") + "\",\n");
-    sb.append("\t\t\t\"User-Agent\": \"" + conf.get("http.robots.agents", "") + "\",\n");  
-		sb.append("\t},\n");
-		
-		// response
-		sb.append("\t\"response\": {\n");
-		sb.append("\t\t\"status\": \"" + ifNullString(metadata.get("status")) + "\",\n");
-		sb.append("\t\t\"server\": {\n");
-    sb.append("\t\t\t\"hostname\": \"" + URLUtil.getHost(url) + "\"\n"); 
-		sb.append("\t\t\t\"address\": \"" + metadata.get("_ip_") + "\"\n");
-		sb.append("\t\t},\n");
-		sb.append("\t\t\"headers\": {\n");
-		sb.append("\t\t\t\"Content-Encoding\": " + ifNullString(metadata.get("Content-Encoding")));
-		sb.append("\t\t\t\"Content-Type\": " + ifNullString(metadata.get("Content-Type")));
-		sb.append("\t\t\t\"Date\": " + ifNullString(metadata.get("Date")));
-		sb.append("\t\t\t\"Server\": " + ifNullString(metadata.get("Server")));
-		sb.append("\t\t},\n");
-		sb.append("\t\t\"body\": " + new String(content) + "\",\n");
-		sb.append("\t},\n");
-		
-		// key
-		sb.append("\t\"key\": \"" + url + "\",\n"); 
-		
-		// imported
-		sb.append("\t\"imported\": \"\"\n"); // TODO
-		
-		sb.append("}");
-
+		for (int i=0; i < this.tabCount ;i++) {
+			sb.append("\t");
+		}
 		return sb.toString();
 	}
-
+	
+    private static String quote(String string) throws IOException {
+    	StringBuilder sb = new StringBuilder();
+    	
+        if (string == null || string.length() == 0) {
+            sb.append("\"\"");
+            return sb.toString();
+        }
+
+        char b;
+        char c = 0;
+        String hhhh;
+        int i;
+        int len = string.length();
+
+        sb.append('"');
+        for (i = 0; i < len; i += 1) {
+            b = c;
+            c = string.charAt(i);
+            switch (c) {
+            case '\\':
+            case '"':
+                sb.append('\\');
+                sb.append(c);
+                break;
+            case '/':
+                if (b == '<') {
+                	sb.append('\\');
+                }
+                sb.append(c);
+                break;
+            case '\b':
+            	sb.append("\\b");
+                break;
+            case '\t':
+            	sb.append("\\t");
+                break;
+            case '\n':
+            	sb.append("\\n");
+                break;
+            case '\f':
+            	sb.append("\\f");
+                break;
+            case '\r':
+            	sb.append("\\r");
+                break;
+            default:
+                if (c < ' ' || (c >= '\u0080' && c < '\u00a0')
+                        || (c >= '\u2000' && c < '\u2100')) {
+                	sb.append("\\u");
+                    hhhh = Integer.toHexString(c);
+                    sb.append("0000", 0, 4 - hhhh.length());
+                    sb.append(hhhh);
+                } else {
+                	sb.append(c);
+                }
+            }
+        }
+        sb.append('"');
+        return sb.toString();
+    }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java?rev=1669248&r1=1669247&r2=1669248&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DumpFileUtil.java Thu Mar 26 02:56:20 2015
@@ -46,21 +46,27 @@ public class DumpFileUtil {
         return sb.toString();
     }
 
-    public static String createTwoLevelsDirectory(String basePath, String md5) {
+    public static String createTwoLevelsDirectory(String basePath, String md5, boolean makeDir) {
         String firstLevelDirName = new StringBuilder().append(md5.charAt(0)).append(md5.charAt(8)).toString();
         String secondLevelDirName = new StringBuilder().append(md5.charAt(16)).append(md5.charAt(24)).toString();
 
         String fullDirPath = String.format(DIR_PATTERN, basePath, firstLevelDirName, secondLevelDirName);
 
-        try {
-            FileUtils.forceMkdir(new File(fullDirPath));
-        } catch (IOException e) {
-            LOG.error("Failed to create dir: {}", fullDirPath);
-            fullDirPath = null;
+        if (makeDir) {
+	        try {
+	            FileUtils.forceMkdir(new File(fullDirPath));
+	        } catch (IOException e) {
+	            LOG.error("Failed to create dir: {}", fullDirPath);
+	            fullDirPath = null;
+	        }
         }
 
         return fullDirPath;
     }
+    
+    public static String createTwoLevelsDirectory(String basePath, String md5) {
+        return createTwoLevelsDirectory(basePath, md5, true);
+    }
 
     public static String createFileName(String md5, String fileBaseName, String fileExtension) {
         if (fileBaseName.length() > MAX_LENGTH_OF_FILENAME) {