You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/03/04 19:48:32 UTC

svn commit: r1664109 - in /nutch/trunk: ./ ivy/ src/bin/ src/java/org/apache/nutch/tools/

Author: lewismc
Date: Wed Mar  4 18:48:32 2015
New Revision: 1664109

URL: http://svn.apache.org/r1664109
Log:
NUTCH-1949 Dump out the Nutch data into the Common Crawl format

Added:
    nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/bin/nutch
    nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar  4 18:48:32 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1949 Dump out the Nutch data into the Common Crawl format (Giuseppe Totaro via lewismc)
+
 * NUTCH-1950 File name too long (Jiaheng Zhang, Chong Li via mattmann)
 
 * NUTCH-1921 Optionally disable HTTP if-modified-since header (markus)

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Mar  4 18:48:32 2015
@@ -49,7 +49,8 @@
 			rev="3.1" conf="*->master" />
 		<dependency org="commons-codec" name="commons-codec" rev="1.3"
 			conf="*->default" />
-		
+                <dependency org="org.apache.commons" name="commons-compress" rev="1.9" 
+                        conf="*->default" />	
 		<dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
 			conf="*->default">
 			<exclude org="hsqldb" name="hsqldb" />
@@ -70,6 +71,9 @@
 		<dependency org="com.google.guava" name="guava" rev="11.0.2" />
 		<dependency org="com.google.code.crawler-commons" name="crawler-commons"
 			rev="0.5" />
+               
+                <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" /> 
+                <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" />
 
 		<!--Configuration: test -->
 

Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed Mar  4 18:48:32 2015
@@ -71,7 +71,8 @@ if [ $# = 0 ]; then
   echo "  mergelinkdb       merge linkdb-s, with optional filtering"
   echo "  index             run the plugin-based indexer on parsed segments and linkdb"
   echo "  dedup             deduplicate entries in the crawldb and give them a special status"
-  echo "  dump              exports cralwed data from segments into files"
+  echo "  dump              exports crawled data from segments into files"
+  echo "  commoncrawldump   exports crawled data from segments into common crawl data format encoded as CBOR"
   echo "  solrindex         run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
   echo "  solrdedup         remove duplicates from solr - DEPRECATED use the dedup command instead"
   echo "  solrclean         remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
@@ -233,6 +234,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; th
   CLASS=org.apache.nutch.crawl.LinkDbMerger
 elif [ "$COMMAND" = "dump" ] ; then
   CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+  CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
 elif [ "$COMMAND" = "solrindex" ] ; then
   CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
   shift

Added: nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Base class for {@link CommonCrawlFormat} implementations.
+ * <p>
+ * It keeps the values handed to the constructor (URL, content bytes, metadata
+ * and configuration) and routes {@link #getJsonData(boolean)} to one of two
+ * hooks that subclasses must provide.
+ * </p>
+ */
+public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+	/** URL given to the constructor. */
+	protected String url;
+
+	/** Content bytes given to the constructor (stored by reference, not copied). */
+	protected byte[] content;
+
+	/** Metadata given to the constructor. */
+	protected Metadata metadata;
+
+	/** Configuration given to the constructor. */
+	protected Configuration conf;
+
+	/**
+	 * Stores the four arguments in the corresponding protected fields.
+	 *
+	 * @param url the url.
+	 * @param content the content.
+	 * @param metadata the metadata.
+	 * @param conf the configuration.
+	 */
+	public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf) {
+		this.conf = conf;
+		this.metadata = metadata;
+		this.content = content;
+		this.url = url;
+	}
+
+	/**
+	 * Delegates to {@link #getJsonDataAll()} when {@code mapAll} is
+	 * {@code true}, otherwise to {@link #getJsonDataSet()}.
+	 */
+	@Override
+	public String getJsonData(boolean mapAll) throws IOException {
+		return mapAll ? getJsonDataAll() : getJsonDataSet();
+	}
+
+	/** Hook used by {@link #getJsonData(boolean)} when {@code mapAll} is {@code false}. */
+	protected abstract String getJsonDataSet() throws IOException;
+
+	/** Hook used by {@link #getJsonData(boolean)} when {@code mapAll} is {@code true}. */
+	protected abstract String getJsonDataAll() throws IOException;
+
+	/**
+	 * Null-safe accessor for string values.
+	 *
+	 * @param value the string to check, may be {@code null}.
+	 * @return {@code value} itself, or the empty string when it is {@code null}.
+	 */
+	protected String ifNullString(String value) {
+		if (value == null) {
+			return "";
+		}
+		return value;
+	}
+
+	/**
+	 * Looks up the host name of the local machine.
+	 *
+	 * @return the local host name, or the empty string when the local host
+	 *         cannot be resolved.
+	 */
+	protected static String getHostName() {
+		try {
+			return InetAddress.getLocalHost().getHostName();
+		} catch (UnknownHostException ignored) {
+			// best effort: an unresolvable local host is reported as ""
+			return "";
+		}
+	}
+
+	/**
+	 * Looks up the textual IP address of the local machine.
+	 *
+	 * @return the local host address, or the empty string when the local host
+	 *         cannot be resolved.
+	 */
+	protected static String getHostAddress() {
+		try {
+			return InetAddress.getLocalHost().getHostAddress();
+		} catch (UnknownHostException ignored) {
+			// best effort: an unresolvable local host is reported as ""
+			return "";
+		}
+	}
+}

Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Wed Mar  4 18:48:32 2015
@@ -53,6 +53,7 @@ public class Benchmark extends Configure
     System.exit(res);
   }
 
+  @SuppressWarnings("unused")
   private static String getDate() {
     return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
         .currentTimeMillis()));

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,470 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+//Tika imports
+import org.apache.tika.Tika;
+import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.SimpleDateFormat;
+
+/**
+ * <p>
+ * The Common Crawl Data Dumper tool enables one to reverse generate the raw
+ * content from Nutch segment data directories into a common crawling data
+ * format, consumed by many applications. The data is then serialized as <a
+ * href="http://cbor.io">CBOR</a>
+ * </p>
+ * <p>
+ * Text content will be stored in a structured document format. Below is a
+ * schema for storage of data and metadata related to a crawling request, with
+ * the response body truncated for readability. This document must be encoded
+ * using CBOR and should be compressed with gzip after encoding. The timestamped
+ * URL key for these records' keys follows the same layout as the media file
+ * directory structure, with underscores in place of directory separators.
+ * </p>
+ * <p>
+ * Thus, the timestamped url key for the record is provided below followed by an
+ * example record:
+ * 
+ * <pre>
+ * {@code
+ * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
+ *     
+ *     {
+ *         "url": "http:\/\/somepage.com\/22\/14560817",
+ *         "timestamp": "1411623696000",
+ *         "request": {
+ *             "method": "GET",
+ *             "client": {
+ *                 "hostname": "crawler01.local",
+ *                 "address": "74.347.129.200",
+ *                 "software": "Apache Nutch v1.10",
+ *                 "robots": "classic",
+ *                 "contact": {
+ *                     "name": "Nutch Admin",
+ *                     "email": "nutch.pro@nutchadmin.org"
+ *                 }
+ *             },
+ *             "headers": {
+ *                 "Accept": "text\/html,application\/xhtml+xml,application\/xml",
+ *                 "Accept-Encoding": "gzip,deflate,sdch",
+ *                 "Accept-Language": "en-US,en",
+ *                 "User-Agent": "Mozilla\/5.0",
+ *                 "...": "..."
+ *             },
+ *             "body": null
+ *         },
+ *         "response": {
+ *             "status": "200",
+ *             "server": {
+ *                 "hostname": "somepage.com",
+ *                 "address": "55.33.51.19",
+ *             },
+ *             "headers": {
+ *                 "Content-Encoding": "gzip",
+ *                 "Content-Type": "text\/html",
+ *                 "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
+ *                 "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
+ *                 "Server": "nginx",
+ *                 "...": "..."
+ *             },
+ *             "body": "\r\n  <!DOCTYPE html PUBLIC ... \r\n\r\n  \r\n    </body>\r\n    </html>\r\n  \r\n\r\n",    
+ *         },
+ *         "key": "com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
+ *         "imported": "1411623698000"
+ *     }
+ *     }
+ * </pre>
+ * 
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which fall
+ * into those classifications. An example is as follows:
+ * </p>
+ * 
+ * <pre>
+ * {@code
+ * INFO: File Types: 
+ *   TOTAL Stats:    {
+ *     {"mimeType":"application/xml","count":19"}
+ *     {"mimeType":"image/png","count":47"}
+ *     {"mimeType":"image/jpeg","count":141"}
+ *     {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ *     {"mimeType":"text/plain","count":89"}
+ *     {"mimeType":"video/quicktime","count":2"}
+ *     {"mimeType":"image/gif","count":63"}
+ *     {"mimeType":"application/xhtml+xml","count":1670"}
+ *     {"mimeType":"application/octet-stream","count":40"}
+ *     {"mimeType":"text/html","count":1863"}
+ *   }
+ * }
+ * </pre>
+ * 
+ */
+public class CommonCrawlDataDumper {
+
+	private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+
+	/**
+	 * Command-line entry point for this tool.
+	 * <p>
+	 * The arguments are named options, not positional values:
+	 * {@code outputDir} (directory that receives the CBOR data; created when
+	 * it does not exist) and {@code segment} (directory containing the
+	 * segments) must both be present, while {@code mimetype} (list of
+	 * mimetypes to dump) and {@code gzip} (flag) are optional. The usage
+	 * message is printed when {@code help} is given or when either of the two
+	 * mandatory options is missing.
+	 * </p>
+	 * <p>
+	 * Exceptions raised while parsing the arguments or dumping are logged and
+	 * printed to standard error; they are not propagated to the caller.
+	 * </p>
+	 * 
+	 * @param args
+	 *            the command-line arguments described above.
+	 * @throws Exception
+	 *             declared by the signature; failures inside the parse/dump
+	 *             step are caught here.
+	 */
+	public static void main(String[] args) throws Exception {
+		Options options = buildOptions();
+
+		CommandLineParser parser = new GnuParser();
+		try {
+			CommandLine line = parser.parse(options, args);
+			boolean hasMandatoryOptions = line.hasOption("outputDir") && line.hasOption("segment");
+			if (line.hasOption("help") || !hasMandatoryOptions) {
+				new HelpFormatter().printHelp(CommonCrawlDataDumper.class.getName(), options, true);
+				return;
+			}
+
+			File outputDir = new File(line.getOptionValue("outputDir"));
+			File segmentRootDir = new File(line.getOptionValue("segment"));
+			String[] mimeTypes = line.getOptionValues("mimetype");
+			boolean gzip = line.hasOption("gzip");
+
+			// the output directory is created on demand
+			if (!outputDir.exists()) {
+				LOG.warn("Output directory: [" + outputDir.getAbsolutePath() + "]: does not exist, creating it.");
+				boolean created = outputDir.mkdirs();
+				if (!created) {
+					throw new Exception("Unable to create: [" + outputDir.getAbsolutePath() + "]");
+				}
+			}
+
+			new CommonCrawlDataDumper().dump(outputDir, segmentRootDir, gzip, mimeTypes);
+		} catch (Exception e) {
+			LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils.stringifyException(e));
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * Builds the command-line options understood by {@link #main(String[])}:
+	 * {@code help}, {@code outputDir}, {@code segment}, {@code mimetype} and
+	 * {@code gzip}, registered in that order.
+	 * 
+	 * @return the populated option set.
+	 */
+	@SuppressWarnings("static-access")
+	private static Options buildOptions() {
+		Options options = new Options();
+		options.addOption(new Option("h", "help", false, "show this help message"));
+		// argument options
+		options.addOption(OptionBuilder
+				.withArgName("outputDir")
+				.hasArg()
+				.withDescription("output directory (which will be created) to host the CBOR data")
+				.create("outputDir"));
+		options.addOption(OptionBuilder
+				.withArgName("segment")
+				.hasArgs()
+				.withDescription("the segment(s) to use")
+				.create("segment"));
+		// optional mimetype filter and gzip flag
+		options.addOption(OptionBuilder
+				.isRequired(false)
+				.withArgName("mimetype")
+				.hasArgs()
+				.withDescription("an optional list of mimetypes to dump, excluding all others. Defaults to all.")
+				.create("mimetype"));
+		options.addOption(OptionBuilder
+				.isRequired(false)
+				.hasArg(false)
+				.withDescription("an optional flag indicating whether to additionally gzip the data")
+				.create("gzip"));
+		return options;
+	}
+	
+	/**
+	 * Dumps the content of Nutch segments as CBOR-encoded records.
+	 * <p>
+	 * Every readable sub-directory of {@code segmentRootDir} is treated as a
+	 * segment. For each one the sequence file
+	 * {@code <segment>/<Content.DIR_NAME>/part-00000/data} is read (segments
+	 * without that file are skipped with a warning; other part files are not
+	 * read). Each record is mapped to JSON with the "JACKSON" formatter,
+	 * serialized as CBOR and written under a file name derived from the
+	 * record URL (base name plus extension, {@code html} when the URL has no
+	 * extension). A name that was already written is skipped, not overwritten.
+	 * </p>
+	 * <p>
+	 * Without {@code gzip} each record becomes one file in {@code outputDir}.
+	 * With {@code gzip} all records are added to a single
+	 * {@code yyyyMMddHHmm.tar.gz} archive in {@code outputDir}.
+	 * </p>
+	 * <p>
+	 * Records whose type detection or JSON mapping fails, or whose CBOR
+	 * serialization fails, are logged and skipped. The segment readers, the
+	 * output streams and the file system handle are released even when the
+	 * dump fails part-way. If {@code segmentRootDir} cannot be listed the
+	 * error is logged and the JVM exits with status 1.
+	 * </p>
+	 * 
+	 * @param outputDir
+	 *            the directory the CBOR data is written to; it must already
+	 *            exist.
+	 * @param segmentRootDir
+	 *            a directory containing one or more segments.
+	 * @param gzip
+	 *            a boolean flag indicating whether the CBOR content should be
+	 *            collected into a gzipped tar archive.
+	 * @param mimeTypes
+	 *            an array of mime types we have to dump, all others will be
+	 *            filtered out; {@code null} dumps every record.
+	 * @throws Exception
+	 *             if a segment cannot be read or the output cannot be written.
+	 */
+	public void dump(File outputDir, File segmentRootDir, boolean gzip, String[] mimeTypes) throws Exception {
+		if (!gzip) {
+			LOG.info("Gzipping CBOR data has been skipped");
+		}
+		// total file counts
+		Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+		// filtered file counters
+		Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
+
+		Configuration conf = NutchConfiguration.create();
+		FileSystem fs = FileSystem.get(conf);
+		File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
+			@Override
+			public boolean accept(File file) {
+				return file.canRead() && file.isDirectory();
+			}
+		});
+
+		if (segmentDirs == null) {
+			LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
+			System.exit(1);
+		}
+
+		// the detector is reusable, so build it once instead of once per record
+		Tika tika = new Tika();
+
+		FileOutputStream fileOutput = null;
+		TarArchiveOutputStream tarOutput = null;
+		// names already added to the archive (gzip mode only)
+		ArrayList<String> fileList = null;
+
+		try {
+			if (gzip) {
+				// HH (24-hour clock): with hh, runs twelve hours apart would share a name
+				String archiveName = new SimpleDateFormat("yyyyMMddHHmm'.tar.gz'").format(new Date());
+				fileOutput = new FileOutputStream(new File(outputDir, archiveName));
+				tarOutput = new TarArchiveOutputStream(
+						new GzipCompressorOutputStream(new BufferedOutputStream(fileOutput)));
+				fileList = new ArrayList<String>();
+			}
+
+			for (File segment : segmentDirs) {
+				LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
+				String segmentContentPath = segment.getAbsolutePath() + File.separator + Content.DIR_NAME + "/part-00000/data";
+				Path file = new Path(segmentContentPath);
+
+				if (!new File(file.toString()).exists()) {
+					LOG.warn("Skipping segment: [" + segmentContentPath + "]: no data directory present");
+					continue;
+				}
+
+				SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+				try {
+					Writable key = (Writable) reader.getKeyClass().newInstance();
+
+					while (reader.next(key)) {
+						Content content = new Content();
+						reader.getCurrentValue(content);
+						String url = key.toString();
+						String baseName = FilenameUtils.getBaseName(url);
+						String extension = FilenameUtils.getExtension(url);
+						if (extension == null || extension.equals("")) {
+							extension = "html";
+						}
+						String filename = baseName + "." + extension;
+
+						// Encode all filetypes if no mimetypes have been given
+						boolean selected = (mimeTypes == null);
+
+						String jsonData;
+						try {
+							String mimeType = tika.detect(content.getContent());
+							// Maps file to JSON-based structure
+							CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, content.getContent(), content.getMetadata(), conf);
+							jsonData = format.getJsonData(false);
+
+							collectStats(typeCounts, mimeType);
+							// collects statistics for the given mimetypes
+							if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
+								collectStats(filteredCounts, mimeType);
+								selected = true;
+							}
+						} catch (Exception e) {
+							// no usable JSON for this record: skip it instead of writing an empty document
+							LOG.warn("Tika is unable to detect type for: [" + url + "]", e);
+							continue;
+						}
+
+						if (!selected) {
+							continue;
+						}
+
+						byte[] byteData = serializeCBORData(jsonData);
+						if (byteData == null) {
+							// serializeCBORData has already logged the cause
+							LOG.warn("Skipping: [" + filename + "]: CBOR serialization failed");
+							continue;
+						}
+
+						if (!gzip) {
+							String outputFullPath = outputDir + File.separator + filename;
+							File outputFile = new File(outputFullPath);
+							if (outputFile.exists()) {
+								LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
+							}
+							else {
+								LOG.info("Writing: [" + outputFullPath + "]");
+								FileOutputStream recordOutput = new FileOutputStream(outputFile);
+								try {
+									IOUtils.copy(new ByteArrayInputStream(byteData), recordOutput);
+								} finally {
+									recordOutput.close();
+								}
+							}
+						}
+						else {
+							if (fileList.contains(filename)) {
+								LOG.info("Skipping compressing: [" + filename + "]: file already exists");
+							}
+							else {
+								fileList.add(filename);
+								LOG.info("Compressing: [" + filename + "]");
+								TarArchiveEntry tarEntry = new TarArchiveEntry(filename);
+								tarEntry.setSize(byteData.length);
+								tarOutput.putArchiveEntry(tarEntry);
+								IOUtils.copy(new ByteArrayInputStream(byteData), tarOutput);
+								tarOutput.closeArchiveEntry();
+							}
+						}
+					}
+				} finally {
+					reader.close();
+				}
+			}
+
+			if (gzip) {
+				tarOutput.finish();
+			}
+		} finally {
+			try {
+				// closing the tar stream also closes the gzip, buffer and file streams it wraps
+				if (tarOutput != null) {
+					tarOutput.close();
+				}
+				else if (fileOutput != null) {
+					fileOutput.close();
+				}
+			} finally {
+				// closed once, after every segment has been read
+				fs.close();
+			}
+		}
+
+		LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+	}
+	
+	/**
+	 * Encodes the given JSON text as CBOR.
+	 * <p>
+	 * The text is written with {@code CBORGenerator.writeString}, i.e. as a
+	 * single CBOR string value.
+	 * </p>
+	 * 
+	 * @param jsonData
+	 *            the JSON text to encode.
+	 * @return the CBOR bytes, or {@code null} when encoding fails (the failure
+	 *         is logged).
+	 */
+	private byte[] serializeCBORData(String jsonData) {
+		CBORFactory factory = new CBORFactory();
+		ByteArrayOutputStream stream = new ByteArrayOutputStream();
+		CBORGenerator generator = null;
+
+		try {
+			generator = factory.createGenerator(stream);
+			generator.writeString(jsonData);
+			generator.flush();
+
+			return stream.toByteArray();
+		} catch (Exception e) {
+			// keep the cause in the log, not only its message
+			LOG.warn("CBOR encoding failed: " + e.getMessage(), e);
+			return null;
+		} finally {
+			// generator is still null when createGenerator itself failed
+			if (generator != null) {
+				try {
+					generator.close();
+				} catch (IOException ignored) {
+					// nothing to do: the bytes were already captured or the failure already logged
+				}
+			}
+		}
+	}
+
+	/**
+	 * Increments the counter kept for {@code mimeType} in {@code typeCounts},
+	 * starting at 1 when the type has not been counted yet.
+	 * 
+	 * @param typeCounts
+	 *            the counters, keyed by mime type; updated in place.
+	 * @param mimeType
+	 *            the mime type to count.
+	 */
+	private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+		int updated = 1;
+		if (typeCounts.containsKey(mimeType)) {
+			updated = typeCounts.get(mimeType) + 1;
+		}
+		typeCounts.put(mimeType, updated);
+	}
+
+	/**
+	 * Renders the mime type counters as the text block that is logged at the
+	 * end of a dump: a "TOTAL Stats" section, followed by a "FILTERED Stats"
+	 * section when {@code filteredCounts} is not empty.
+	 * <p>
+	 * NOTE(review): each entry is emitted as {@code {"mimeType":"x","count":N"}},
+	 * with a quote after the number but none before it, so the lines are not
+	 * valid JSON. Left as is because the text is runtime output.
+	 * </p>
+	 * 
+	 * @param typeCounts
+	 *            counters for every record, keyed by mime type.
+	 * @param filteredCounts
+	 *            counters for the records that matched the requested mime types.
+	 * @return the formatted statistics.
+	 */
+	private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
+		StringBuilder report = new StringBuilder();
+
+		// totals are always reported
+		report.append("\n  TOTAL Stats:\n").append("                {\n");
+		for (Map.Entry<String, Integer> total : typeCounts.entrySet()) {
+			report.append("    {\"mimeType\":\"").append(total.getKey())
+					.append("\",\"count\":").append(total.getValue())
+					.append("\"}\n");
+		}
+		report.append("}\n");
+
+		// the filtered section only appears when a filter matched something
+		if (filteredCounts.isEmpty()) {
+			return report.toString();
+		}
+
+		report.append("\n  FILTERED Stats:\n").append("                {\n");
+		for (Map.Entry<String, Integer> filtered : filteredCounts.entrySet()) {
+			report.append("    {\"mimeType\":\"").append(filtered.getKey())
+					.append("\",\"count\":").append(filtered.getValue())
+					.append("\"}\n");
+		}
+		report.append("}\n");
+
+		return report.toString();
+	}
+}

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+/**
+ * Contract shared by all CommonCrawl formatters. It declares the single
+ * method used to obtain the JSON data.
+ * 
+ * @author gtotaro
+ *
+ */
+public interface CommonCrawlFormat {
+
+	/**
+	 * Returns the JSON data produced by this formatter.
+	 * 
+	 * @param mapAll if {@code true}, maps all metadata onto the JSON structure.
+	 * @return the JSON data
+	 * @throws IOException declared for implementations to report I/O failures.
+	 */
+	String getJsonData(boolean mapAll) throws IOException;
+}

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Factory class that creates new {@link CommonCrawlFormat} objects (a.k.a. formatters) that map crawled files to the CommonCrawl format.
+ *
+ */
+public class CommonCrawlFormatFactory {
+
+	/**
+	 * Returns a new {@link CommonCrawlFormat} instance for the requested type of formatter.
+	 * The type is compared case-insensitively with {@code "jackson"}, {@code "jettinson"} and {@code "simple"}.
+	 * @param formatType the type of formatter to be created.
+	 * @param url the url.
+	 * @param content the content.
+	 * @param metadata the metadata.
+	 * @param conf the configuration.
+	 * @return the new {@link CommonCrawlFormat} object, or {@code null} when {@code formatType} is {@code null} or not one of the known types.
+	 */
+	public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, byte[] content,
+			Metadata metadata, Configuration conf) {
+		// equalsIgnoreCase(null) is false, so a null type falls through to the final return
+		if ("jackson".equalsIgnoreCase(formatType)) {
+			return new CommonCrawlFormatJackson(url, content, metadata, conf);
+		}
+		if ("jettinson".equalsIgnoreCase(formatType)) {
+			return new CommonCrawlFormatJettinson(url, content, metadata, conf);
+		}
+		if ("simple".equalsIgnoreCase(formatType)) {
+			return new CommonCrawlFormatSimple(url, content, metadata, conf);
+		}
+
+		return null;
+	}
+}

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,174 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+/**
+ * Maps crawled data to JSON in the Common Crawl layout using the Jackson
+ * Streaming API.
+ */
+public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
+
+  private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+
+  /** Response headers emitted by {@link #getJsonDataSet()}. */
+  private static final String[] SELECTED_RESPONSE_HEADERS = {
+      "Content-Encoding", "Content-Type", "Date", "Server" };
+
+  public CommonCrawlFormatJackson(String url, byte[] content,
+      Metadata metadata, Configuration conf) {
+    super(url, content, metadata, conf);
+  }
+
+  /**
+   * Builds the JSON document with every metadata entry listed as a response
+   * header.
+   *
+   * @return the pretty-printed JSON document.
+   * @throws IOException if Jackson fails to generate the document.
+   */
+  @Override
+  protected String getJsonDataAll() throws IOException {
+    return generateJson(metadata.names());
+  }
+
+  /**
+   * Builds the JSON document with only the Content-Encoding, Content-Type,
+   * Date and Server response headers.
+   *
+   * @return the pretty-printed JSON document.
+   * @throws IOException if Jackson fails to generate the document.
+   */
+  @Override
+  protected String getJsonDataSet() throws IOException {
+    return generateJson(SELECTED_RESPONSE_HEADERS);
+  }
+
+  /**
+   * Writes the whole document: url, timestamp, request, response, key and
+   * imported, all as members of the root object.
+   *
+   * @param responseHeaderNames metadata names to emit as response headers.
+   * @return the pretty-printed JSON document.
+   * @throws IOException if Jackson fails to generate the document.
+   */
+  private String generateJson(String[] responseHeaderNames) throws IOException {
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    JsonGenerator generator = null;
+
+    try {
+      generator = new JsonFactory().createGenerator(out);
+      generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+      generator.writeStartObject();
+      generator.writeStringField("url", url);
+      generator.writeStringField("timestamp", metadata.get(Metadata.LAST_MODIFIED));
+      writeRequest(generator);
+      writeResponse(generator, responseHeaderNames);
+      generator.writeStringField("key", url);
+      generator.writeStringField("imported", ""); // TODO
+      generator.writeEndObject();
+
+      generator.flush();
+
+      // The generator encodes as UTF-8, so decode with the same charset
+      // instead of the platform default.
+      return out.toString("UTF-8");
+
+    } catch (IOException ioe) {
+      LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+      // Keep the original exception as the cause so the stack trace survives.
+      throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage(), ioe);
+    } finally {
+      if (generator != null) {
+        generator.close();
+      }
+    }
+  }
+
+  /**
+   * Writes the "request" member: method, client, headers and a null body.
+   *
+   * @param generator the generator positioned inside the root object.
+   * @throws IOException if Jackson fails to write.
+   */
+  private void writeRequest(JsonGenerator generator) throws IOException {
+    generator.writeObjectFieldStart("request");
+    generator.writeStringField("method", "GET");
+
+    generator.writeObjectFieldStart("client");
+    generator.writeStringField("hostname", getHostName());
+    generator.writeStringField("address", getHostAddress());
+    generator.writeStringField("software", conf.get("http.agent.version", ""));
+    generator.writeStringField("robots", "CLASSIC");
+    generator.writeObjectFieldStart("contact");
+    generator.writeStringField("name", conf.get("http.agent.name", ""));
+    generator.writeStringField("email", conf.get("http.agent.email", ""));
+    generator.writeEndObject(); // contact
+    generator.writeEndObject(); // client
+
+    generator.writeObjectFieldStart("headers");
+    generator.writeStringField("Accept", conf.get("http.accept", ""));
+    generator.writeStringField("Accept-Encoding", ""); // TODO
+    generator.writeStringField("Accept-Language", conf.get("http.accept.language", ""));
+    generator.writeStringField("User-Agent", conf.get("http.robots.agents", ""));
+    generator.writeEndObject(); // headers
+
+    generator.writeNullField("body");
+    generator.writeEndObject(); // request
+  }
+
+  /**
+   * Writes the "response" member: status, server, headers and body.
+   *
+   * @param generator the generator positioned inside the root object.
+   * @param headerNames metadata names to emit as response headers.
+   * @throws IOException if Jackson fails to write.
+   */
+  private void writeResponse(JsonGenerator generator, String[] headerNames) throws IOException {
+    generator.writeObjectFieldStart("response");
+    generator.writeStringField("status", ifNullString(metadata.get("status")));
+
+    generator.writeObjectFieldStart("server");
+    generator.writeStringField("hostname", URLUtil.getHost(url));
+    generator.writeStringField("address", ifNullString(metadata.get("_ip_")));
+    generator.writeEndObject(); // server
+
+    generator.writeObjectFieldStart("headers");
+    for (String name : headerNames) {
+      generator.writeStringField(name, ifNullString(metadata.get(name)));
+    }
+    generator.writeEndObject(); // headers
+
+    // NOTE(review): decodes with the platform default charset; the page's
+    // real encoding is not known here.
+    generator.writeStringField("body", new String(content));
+    generator.writeEndObject(); // response
+  }
+}

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Maps crawled data to JSON in the Common Crawl layout using the Jettison
+ * {@link JSONObject} API.
+ */
+public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
+
+  private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+
+  /** Response headers emitted by {@link #getJsonDataSet()}. */
+  private static final String[] SELECTED_RESPONSE_HEADERS = {
+      "Content-Encoding", "Content-Type", "Date", "Server" };
+
+  public CommonCrawlFormatJettinson(String url, byte[] content,
+      Metadata metadata, Configuration conf) {
+    super(url, content, metadata, conf);
+  }
+
+  /**
+   * Builds the JSON document with every metadata entry listed as a response
+   * header.
+   *
+   * @return the JSON document, indented by two spaces.
+   * @throws IOException if Jettison fails to build or serialize the document.
+   */
+  @Override
+  protected String getJsonDataAll() throws IOException {
+    return generateJson(metadata.names());
+  }
+
+  /**
+   * Builds the JSON document with only the Content-Encoding, Content-Type,
+   * Date and Server response headers.
+   *
+   * @return the JSON document, indented by two spaces.
+   * @throws IOException if Jettison fails to build or serialize the document.
+   */
+  @Override
+  protected String getJsonDataSet() throws IOException {
+    return generateJson(SELECTED_RESPONSE_HEADERS);
+  }
+
+  /**
+   * Assembles the root object: url, timestamp, request, response, key and
+   * imported.
+   *
+   * @param responseHeaderNames metadata names to emit as response headers.
+   * @return the JSON document, indented by two spaces.
+   * @throws IOException wrapping any {@link JSONException} raised by Jettison.
+   */
+  private String generateJson(String[] responseHeaderNames) throws IOException {
+    try {
+      JSONObject object = new JSONObject();
+      object.put("url", url);
+      object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
+      object.put("request", buildRequest());
+      object.put("response", buildResponse(responseHeaderNames));
+      object.put("key", url);
+      object.put("imported", ""); // TODO
+
+      return object.toString(2); // INDENTED OUTPUT
+
+    } catch (JSONException jsone) {
+      LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
+      // Keep the original exception as the cause so the stack trace survives.
+      throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage(), jsone);
+    }
+  }
+
+  /**
+   * Builds the "request" object: method, client, headers and a null body.
+   *
+   * @return the request object.
+   * @throws JSONException if Jettison rejects a key or value.
+   */
+  private JSONObject buildRequest() throws JSONException {
+    JSONObject contactObject = new JSONObject();
+    contactObject.put("name", conf.get("http.agent.name", ""));
+    contactObject.put("email", conf.get("http.agent.email", ""));
+
+    JSONObject clientObject = new JSONObject();
+    clientObject.put("hostname", getHostName());
+    clientObject.put("address", getHostAddress());
+    clientObject.put("software", conf.get("http.agent.version", ""));
+    clientObject.put("robots", "CLASSIC");
+    clientObject.put("contact", contactObject);
+
+    JSONObject reqHeadersObject = new JSONObject();
+    reqHeadersObject.put("Accept", conf.get("http.accept", ""));
+    reqHeadersObject.put("Accept-Encoding", ""); // TODO
+    reqHeadersObject.put("Accept-Language", conf.get("http.accept.language", ""));
+    reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
+
+    JSONObject requestObject = new JSONObject();
+    requestObject.put("method", "GET");
+    requestObject.put("client", clientObject);
+    requestObject.put("headers", reqHeadersObject);
+    requestObject.put("body", JSONObject.NULL);
+    return requestObject;
+  }
+
+  /**
+   * Builds the "response" object: status, server, headers and body.
+   *
+   * @param headerNames metadata names to emit as response headers.
+   * @return the response object.
+   * @throws JSONException if Jettison rejects a key or value.
+   */
+  private JSONObject buildResponse(String[] headerNames) throws JSONException {
+    JSONObject serverObject = new JSONObject();
+    serverObject.put("hostname", URLUtil.getHost(url));
+    serverObject.put("address", ifNullString(metadata.get("_ip_")));
+
+    JSONObject respHeadersObject = new JSONObject();
+    for (String name : headerNames) {
+      respHeadersObject.put(name, ifNullString(metadata.get(name)));
+    }
+
+    JSONObject responseObject = new JSONObject();
+    responseObject.put("status", ifNullString(metadata.get("status")));
+    // Keyed "server" to match the Jackson and simple formatters.
+    responseObject.put("server", serverObject);
+    responseObject.put("headers", respHeadersObject);
+    // NOTE(review): decodes with the platform default charset; the page's
+    // real encoding is not known here.
+    responseObject.put("body", new String(content));
+    return responseObject;
+  }
+}

Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java Wed Mar  4 18:48:32 2015
@@ -0,0 +1,209 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * Maps crawled data to JSON in the Common Crawl layout using a plain
+ * {@link StringBuilder}, without any JSON library.
+ */
+public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
+
+  /** Response headers emitted by {@link #getJsonDataSet()}. */
+  private static final String[] SELECTED_RESPONSE_HEADERS = {
+      "Content-Encoding", "Content-Type", "Date", "Server" };
+
+  public CommonCrawlFormatSimple(String url, byte[] content, Metadata metadata,
+      Configuration conf) {
+    super(url, content, metadata, conf);
+  }
+
+  /**
+   * Builds the JSON document with every metadata entry listed as a response
+   * header.
+   *
+   * @return the JSON document, indented with tabs.
+   */
+  @Override
+  protected String getJsonDataAll() {
+    return generateJson(metadata.names());
+  }
+
+  /**
+   * Builds the JSON document with only the Content-Encoding, Content-Type,
+   * Date and Server response headers.
+   *
+   * @return the JSON document, indented with tabs.
+   */
+  @Override
+  protected String getJsonDataSet() {
+    return generateJson(SELECTED_RESPONSE_HEADERS);
+  }
+
+  /**
+   * Assembles the root object: url, timestamp, request, response, key and
+   * imported. Every name and value goes through {@link #quote(String)}, and
+   * a comma is written only between members, so the result is valid JSON.
+   *
+   * @param responseHeaderNames metadata names to emit as response headers.
+   * @return the JSON document, indented with tabs.
+   */
+  private String generateJson(String[] responseHeaderNames) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("{\n");
+
+    appendString(sb, 1, "url", url, false);
+    appendString(sb, 1, "timestamp", metadata.get(Metadata.LAST_MODIFIED), false);
+
+    // request
+    openObject(sb, 1, "request");
+    appendString(sb, 2, "method", "GET", false);
+    openObject(sb, 2, "client");
+    appendString(sb, 3, "hostname", getHostName(), false);
+    appendString(sb, 3, "address", getHostAddress(), false);
+    appendString(sb, 3, "software", conf.get("http.agent.version", ""), false);
+    appendString(sb, 3, "robots", "CLASSIC", false);
+    openObject(sb, 3, "contact");
+    appendString(sb, 4, "name", conf.get("http.agent.name", ""), false);
+    appendString(sb, 4, "email", conf.get("http.agent.email", ""), true);
+    closeObject(sb, 3, true); // contact
+    closeObject(sb, 2, false); // client
+    openObject(sb, 2, "headers");
+    appendString(sb, 3, "Accept", conf.get("http.accept", ""), false);
+    appendString(sb, 3, "Accept-Encoding", "", false); // TODO
+    appendString(sb, 3, "Accept-Language", conf.get("http.accept.language", ""), false);
+    appendString(sb, 3, "User-Agent", conf.get("http.robots.agents", ""), true);
+    closeObject(sb, 2, false); // headers
+    indent(sb, 2).append("\"body\": null\n");
+    closeObject(sb, 1, false); // request
+
+    // response
+    openObject(sb, 1, "response");
+    appendString(sb, 2, "status", ifNullString(metadata.get("status")), false);
+    openObject(sb, 2, "server");
+    appendString(sb, 3, "hostname", URLUtil.getHost(url), false);
+    appendString(sb, 3, "address", ifNullString(metadata.get("_ip_")), true);
+    closeObject(sb, 2, false); // server
+    openObject(sb, 2, "headers");
+    for (int i = 0; i < responseHeaderNames.length; i++) {
+      String name = responseHeaderNames[i];
+      boolean last = (i == responseHeaderNames.length - 1);
+      appendString(sb, 3, name, ifNullString(metadata.get(name)), last);
+    }
+    closeObject(sb, 2, false); // headers
+    // NOTE(review): decodes with the platform default charset; the page's
+    // real encoding is not known here.
+    appendString(sb, 2, "body", new String(content), true);
+    closeObject(sb, 1, false); // response
+
+    appendString(sb, 1, "key", url, false);
+    appendString(sb, 1, "imported", "", true); // TODO
+
+    sb.append("}");
+
+    return sb.toString();
+  }
+
+  /** Appends {@code depth} tab characters and returns {@code sb} for chaining. */
+  private static StringBuilder indent(StringBuilder sb, int depth) {
+    for (int i = 0; i < depth; i++) {
+      sb.append('\t');
+    }
+    return sb;
+  }
+
+  /** Appends the line that opens the nested object called {@code name}. */
+  private static void openObject(StringBuilder sb, int depth, String name) {
+    indent(sb, depth).append(quote(name)).append(": {\n");
+  }
+
+  /** Appends a closing brace, followed by a comma unless it ends its parent. */
+  private static void closeObject(StringBuilder sb, int depth, boolean last) {
+    indent(sb, depth).append(last ? "}\n" : "},\n");
+  }
+
+  /**
+   * Appends one string member on its own line.
+   *
+   * @param sb the buffer to append to.
+   * @param depth the number of tabs to indent by.
+   * @param name the member name.
+   * @param value the member value; {@code null} is written as JSON null.
+   * @param last whether this is the last member of its object, in which
+   *          case no trailing comma is written.
+   */
+  private static void appendString(StringBuilder sb, int depth, String name,
+      String value, boolean last) {
+    indent(sb, depth).append(quote(name)).append(": ").append(quote(value));
+    sb.append(last ? "\n" : ",\n");
+  }
+
+  /**
+   * Renders a value as a JSON string literal, escaping quotes, backslashes
+   * and control characters.
+   *
+   * @param value the raw text, possibly {@code null}.
+   * @return the quoted literal, or the unquoted text {@code null} when
+   *         {@code value} is {@code null}.
+   */
+  private static String quote(String value) {
+    if (value == null) {
+      return "null";
+    }
+    StringBuilder quoted = new StringBuilder(value.length() + 2);
+    quoted.append('"');
+    for (int i = 0; i < value.length(); i++) {
+      char c = value.charAt(i);
+      switch (c) {
+      case '"':
+        quoted.append("\\\"");
+        break;
+      case '\\':
+        quoted.append("\\\\");
+        break;
+      case '\b':
+        quoted.append("\\b");
+        break;
+      case '\f':
+        quoted.append("\\f");
+        break;
+      case '\n':
+        quoted.append("\\n");
+        break;
+      case '\r':
+        quoted.append("\\r");
+        break;
+      case '\t':
+        quoted.append("\\t");
+        break;
+      default:
+        if (c < 0x20) {
+          // Remaining control characters have no short escape.
+          quoted.append(String.format("\\u%04x", (int) c));
+        } else {
+          quoted.append(c);
+        }
+      }
+    }
+    quoted.append('"');
+    return quoted.toString();
+  }
+}

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Mar  4 18:48:32 2015
@@ -100,7 +100,7 @@ import org.slf4j.LoggerFactory;
  * }
  * </pre>
  * <p>
- * In the case above the tool would have been run with the <b>-mimeType
+ * In the case above, the tool would have been run with the <b>-mimeType
  * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
  * flag and corresponding values activated.
  *