You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/03/04 19:48:32 UTC
svn commit: r1664109 - in /nutch/trunk: ./ ivy/ src/bin/
src/java/org/apache/nutch/tools/
Author: lewismc
Date: Wed Mar 4 18:48:32 2015
New Revision: 1664109
URL: http://svn.apache.org/r1664109
Log:
NUTCH-1949 Dump out the Nutch data into the Common Crawl format
Added:
nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/bin/nutch
nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 4 18:48:32 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1949 Dump out the Nutch data into the Common Crawl format (Giuseppe Totaro via lewismc)
+
* NUTCH-1950 File name too long (Jiaheng Zhang, Chong Li via mattmann)
* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus)
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Wed Mar 4 18:48:32 2015
@@ -49,7 +49,8 @@
rev="3.1" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.3"
conf="*->default" />
-
+ <dependency org="org.apache.commons" name="commons-compress" rev="1.9"
+ conf="*->default" />
<dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0"
conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
@@ -70,6 +71,9 @@
<dependency org="com.google.guava" name="guava" rev="11.0.2" />
<dependency org="com.google.code.crawler-commons" name="crawler-commons"
rev="0.5" />
+
+ <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" />
+ <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" />
<!--Configuration: test -->
Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Wed Mar 4 18:48:32 2015
@@ -71,7 +71,8 @@ if [ $# = 0 ]; then
echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the plugin-based indexer on parsed segments and linkdb"
echo " dedup deduplicate entries in the crawldb and give them a special status"
- echo " dump exports cralwed data from segments into files"
+ echo " dump exports crawled data from segments into files"
+ echo " commoncrawldump exports crawled data from segments into common crawl data format encoded as CBOR"
echo " solrindex run the solr indexer on parsed segments and linkdb - DEPRECATED use the index command instead"
echo " solrdedup remove duplicates from solr - DEPRECATED use the dedup command instead"
echo " solrclean remove HTTP 301 and 404 documents from solr - DEPRECATED use the clean command instead"
@@ -233,6 +234,8 @@ elif [ "$COMMAND" = "mergelinkdb" ] ; th
CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "dump" ] ; then
CLASS=org.apache.nutch.tools.FileDumper
+elif [ "$COMMAND" = "commoncrawldump" ] ; then
+ CLASS=org.apache.nutch.tools.CommonCrawlDataDumper
elif [ "$COMMAND" = "solrindex" ] ; then
CLASS="org.apache.nutch.indexer.IndexingJob -D solr.server.url=$1"
shift
Added: nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Base class for {@link CommonCrawlFormat} implementations. It keeps the
+ * crawled record (URL, raw content, metadata) together with the configuration
+ * and routes {@link #getJsonData(boolean)} to one of two mappings supplied by
+ * the concrete subclass.
+ */
+public abstract class AbstractCommonCrawlFormat implements CommonCrawlFormat {
+  /** URL of the record being mapped. */
+  protected String url;
+
+  /** Raw content bytes of the record. */
+  protected byte[] content;
+
+  /** Metadata attached to the record. */
+  protected Metadata metadata;
+
+  /** Configuration handed in by the caller. */
+  protected Configuration conf;
+
+  /**
+   * Stores the given values as-is; no copies are made and nothing is validated.
+   *
+   * @param url the url of the record.
+   * @param content the raw content of the record.
+   * @param metadata the metadata of the record.
+   * @param conf the configuration.
+   */
+  public AbstractCommonCrawlFormat(String url, byte[] content, Metadata metadata, Configuration conf) {
+    this.conf = conf;
+    this.metadata = metadata;
+    this.content = content;
+    this.url = url;
+  }
+
+  /**
+   * Returns {@link #getJsonDataAll()} when {@code mapAll} is {@code true},
+   * otherwise {@link #getJsonDataSet()}.
+   */
+  @Override
+  public String getJsonData(boolean mapAll) throws IOException {
+    return mapAll ? getJsonDataAll() : getJsonDataSet();
+  }
+
+  /** Produces the JSON returned by {@link #getJsonData(boolean)} for {@code false}. */
+  protected abstract String getJsonDataSet() throws IOException;
+
+  /** Produces the JSON returned by {@link #getJsonData(boolean)} for {@code true}. */
+  protected abstract String getJsonDataAll() throws IOException;
+
+  /**
+   * Null-safe accessor for strings.
+   *
+   * @param value a possibly {@code null} string.
+   * @return {@code value}, or the empty string when {@code value} is {@code null}.
+   */
+  protected String ifNullString(String value) {
+    if (value == null) {
+      return "";
+    }
+    return value;
+  }
+
+  /**
+   * @return the host name reported for the local host, or the empty string
+   *         when the local host cannot be resolved.
+   */
+  protected static String getHostName() {
+    try {
+      return InetAddress.getLocalHost().getHostName();
+    } catch (UnknownHostException ignored) {
+      // best effort: an unresolvable local host yields an empty value
+      return "";
+    }
+  }
+
+  /**
+   * @return the address reported for the local host, or the empty string when
+   *         the local host cannot be resolved.
+   */
+  protected static String getHostAddress() {
+    try {
+      return InetAddress.getLocalHost().getHostAddress();
+    } catch (UnknownHostException ignored) {
+      // best effort: an unresolvable local host yields an empty value
+      return "";
+    }
+  }
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Wed Mar 4 18:48:32 2015
@@ -53,6 +53,7 @@ public class Benchmark extends Configure
System.exit(res);
}
+ @SuppressWarnings("unused")
private static String getDate() {
return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
.currentTimeMillis()));
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,470 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+//JDK imports
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+//Commons imports
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.FilenameUtils;
+
+//Hadoop
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+//Tika imports
+import org.apache.tika.Tika;
+import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.ibm.icu.text.SimpleDateFormat;
+
+/**
+ * <p>
+ * The Common Crawl Data Dumper tool enables one to reverse generate the raw
+ * content from Nutch segment data directories into a common crawling data
+ * format, consumed by many applications. The data is then serialized as <a
+ * href="http://cbor.io">CBOR</a>
+ * </p>
+ * <p>
+ * Text content will be stored in a structured document format. Below is a
+ * schema for storage of data and metadata related to a crawling request, with
+ * the response body truncated for readability. This document must be encoded
+ * using CBOR and should be compressed with gzip after encoding. The timestamped
+ * URL key for these records' keys follows the same layout as the media file
+ * directory structure, with underscores in place of directory separators.
+ * </p>
+ * <p>
+ * Thus, the timestamped url key for the record is provided below followed by an
+ * example record:
+ * </p>
+ *
+ * <pre>
+ * {@code
+ * com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000
+ *
+ * {
+ * "url": "http:\/\/somepage.com\/22\/14560817",
+ * "timestamp": "1411623696000",
+ * "request": {
+ * "method": "GET",
+ * "client": {
+ * "hostname": "crawler01.local",
+ * "address": "74.347.129.200",
+ * "software": "Apache Nutch v1.10",
+ * "robots": "classic",
+ * "contact": {
+ * "name": "Nutch Admin",
+ * "email": "nutch.pro@nutchadmin.org"
+ * }
+ * },
+ * "headers": {
+ * "Accept": "text\/html,application\/xhtml+xml,application\/xml",
+ * "Accept-Encoding": "gzip,deflate,sdch",
+ * "Accept-Language": "en-US,en",
+ * "User-Agent": "Mozilla\/5.0",
+ * "...": "..."
+ * },
+ * "body": null
+ * },
+ * "response": {
+ * "status": "200",
+ * "server": {
+ * "hostname": "somepage.com",
+ * "address": "55.33.51.19",
+ * },
+ * "headers": {
+ * "Content-Encoding": "gzip",
+ * "Content-Type": "text\/html",
+ * "Date": "Thu, 25 Sep 2014 04:16:58 GMT",
+ * "Expires": "Thu, 25 Sep 2014 04:16:57 GMT",
+ * "Server": "nginx",
+ * "...": "..."
+ * },
+ * "body": "\r\n <!DOCTYPE html PUBLIC ... \r\n\r\n \r\n </body>\r\n </html>\r\n \r\n\r\n",
+ * },
+ * "key": "com_somepage_33a3e36bbef59c2a5242c2ccee59239ab30d51f3_1411623696000",
+ * "imported": "1411623698000"
+ * }
+ * }
+ * </pre>
+ *
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which fall
+ * into those classifications. An example is as follows:
+ * </p>
+ *
+ * <pre>
+ * {@code
+ * INFO: File Types:
+ * TOTAL Stats: {
+ * {"mimeType":"application/xml","count":19"}
+ * {"mimeType":"image/png","count":47"}
+ * {"mimeType":"image/jpeg","count":141"}
+ * {"mimeType":"image/vnd.microsoft.icon","count":4"}
+ * {"mimeType":"text/plain","count":89"}
+ * {"mimeType":"video/quicktime","count":2"}
+ * {"mimeType":"image/gif","count":63"}
+ * {"mimeType":"application/xhtml+xml","count":1670"}
+ * {"mimeType":"application/octet-stream","count":40"}
+ * {"mimeType":"text/html","count":1863"}
+ * }
+ * }
+ * </pre>
+ *
+ */
+public class CommonCrawlDataDumper {
+
+ private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlDataDumper.class.getName());
+
+ /**
+  * Command-line entry point of the tool.
+  *
+  * @param args
+  *          <code>-outputDir</code>: the directory that receives the CBOR
+  *          data (created when it does not exist yet);
+  *          <code>-segment</code>: a directory containing one or more
+  *          segments to read; optionally <code>-mimetype</code>: the
+  *          mimetypes to dump, and <code>-gzip</code>: additionally gzip the
+  *          data. Usage is printed when <code>-help</code> is given or when
+  *          <code>-outputDir</code> or <code>-segment</code> is missing.
+  * @throws Exception
+  *           declared for callers; failures raised while parsing or dumping
+  *           are caught, logged and printed here.
+  */
+ @SuppressWarnings("static-access")
+ public static void main(String[] args) throws Exception {
+   Options options = new Options();
+   options.addOption(new Option("h", "help", false,
+       "show this help message"));
+   // argument options
+   options.addOption(OptionBuilder
+       .withArgName("outputDir")
+       .hasArg()
+       .withDescription(
+           "output directory (which will be created) to host the CBOR data")
+       .create("outputDir"));
+   options.addOption(OptionBuilder
+       .withArgName("segment")
+       .hasArgs()
+       .withDescription("the segment(s) to use")
+       .create("segment"));
+   // optional mimetype filter and gzip switch
+   options.addOption(OptionBuilder
+       .isRequired(false)
+       .withArgName("mimetype")
+       .hasArgs()
+       .withDescription(
+           "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
+       .create("mimetype"));
+   options.addOption(OptionBuilder
+       .isRequired(false)
+       .hasArg(false)
+       .withDescription(
+           "an optional flag indicating whether to additionally gzip the data")
+       .create("gzip"));
+
+   try {
+     CommandLine cli = new GnuParser().parse(options, args);
+
+     // both the output directory and the segment root are mandatory
+     boolean hasRequired = cli.hasOption("outputDir") && cli.hasOption("segment");
+     if (cli.hasOption("help") || !hasRequired) {
+       new HelpFormatter().printHelp(CommonCrawlDataDumper.class.getName(), options, true);
+       return;
+     }
+
+     File outputDir = new File(cli.getOptionValue("outputDir"));
+     // only the first -segment value is used as the segment root
+     File segmentRootDir = new File(cli.getOptionValue("segment"));
+     String[] mimeTypes = cli.getOptionValues("mimetype");
+     boolean gzip = cli.hasOption("gzip");
+
+     if (!outputDir.exists()) {
+       String outputPath = outputDir.getAbsolutePath();
+       LOG.warn("Output directory: [" + outputPath + "]: does not exist, creating it.");
+       if (!outputDir.mkdirs()) {
+         throw new Exception("Unable to create: [" + outputPath + "]");
+       }
+     }
+
+     new CommonCrawlDataDumper().dump(outputDir, segmentRootDir, gzip, mimeTypes);
+   } catch (Exception e) {
+     // report through the logger and on stderr
+     LOG.error(CommonCrawlDataDumper.class.getName() + ": " + StringUtils.stringifyException(e));
+     e.printStackTrace();
+   }
+ }
+
+ /**
+  * Maps every record found in the segments below <code>segmentRootDir</code>
+  * to the JSON structure produced by the "JACKSON" formatter, serializes it
+  * as CBOR and writes it to <code>outputDir</code>: one file per record, or,
+  * when <code>gzip</code> is set, one entry per record in a single
+  * timestamped <code>.tar.gz</code> archive. A record whose file name was
+  * already written is skipped. Mimetype statistics are logged at the end.
+  *
+  * @param outputDir
+  *          the directory the CBOR files (or the archive) are written to;
+  *          it is expected to exist.
+  * @param segmentRootDir
+  *          a directory whose readable sub-directories are treated as
+  *          segments; a segment without
+  *          <code>content/part-00000/data</code> is skipped with a warning.
+  * @param gzip
+  *          a boolean flag indicating whether the CBOR content should be
+  *          collected in a gzipped tar archive instead of plain files.
+  * @param mimeTypes
+  *          the mime types to dump, all others are filtered out;
+  *          <code>null</code> dumps every record.
+  * @throws Exception
+  *           if a segment cannot be read or the output cannot be written.
+  */
+ public void dump(File outputDir, File segmentRootDir, boolean gzip, String[] mimeTypes) throws Exception {
+   if (!gzip) {
+     LOG.info("Gzipping CBOR data has been skipped");
+   }
+   // total file counts
+   Map<String, Integer> typeCounts = new HashMap<String, Integer>();
+   // filtered file counters
+   Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
+
+   Configuration conf = NutchConfiguration.create();
+   FileSystem fs = FileSystem.get(conf);
+   File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
+     @Override
+     public boolean accept(File file) {
+       return file.canRead() && file.isDirectory();
+     }
+   });
+
+   // listFiles() yields null when the root is not a listable directory
+   if (segmentDirs == null) {
+     LOG.error("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
+     System.exit(1);
+   }
+
+   // one detector instance serves the whole run
+   Tika tika = new Tika();
+
+   // archive state, only used when gzip is requested
+   FileOutputStream fileOutput = null;
+   TarArchiveOutputStream tarOutput = null;
+   ArrayList<String> fileList = null;
+
+   try {
+     if (gzip) {
+       // HH (0-23) instead of hh (1-12): otherwise a morning and an
+       // afternoon run of the same day produce the same archive name
+       String archiveName = new SimpleDateFormat("yyyyMMddHHmm'.tar.gz'").format(new Date());
+       fileOutput = new FileOutputStream(new File(outputDir + File.separator + archiveName));
+       tarOutput = new TarArchiveOutputStream(
+           new GzipCompressorOutputStream(new BufferedOutputStream(fileOutput)));
+       fileList = new ArrayList<String>();
+     }
+
+     for (File segment : segmentDirs) {
+       LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
+       String segmentContentPath = segment.getAbsolutePath() + File.separator + Content.DIR_NAME + "/part-00000/data";
+       Path file = new Path(segmentContentPath);
+
+       if (!new File(file.toString()).exists()) {
+         LOG.warn("Skipping segment: [" + segmentContentPath + "]: no data directory present");
+         continue;
+       }
+
+       SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+       try {
+         Writable key = (Writable) reader.getKeyClass().newInstance();
+
+         while (reader.next(key)) {
+           Content content = new Content();
+           reader.getCurrentValue(content);
+           String url = key.toString();
+           String baseName = FilenameUtils.getBaseName(url);
+           String extension = FilenameUtils.getExtension(url);
+           if (extension == null || extension.equals("")) {
+             extension = "html";
+           }
+
+           String filename = baseName + "." + extension;
+
+           // Encode all filetypes if no mimetypes have been given
+           boolean filter = (mimeTypes == null);
+
+           String jsonData = "";
+           try {
+             String mimeType = tika.detect(content.getContent());
+             // Maps file to JSON-based structure
+             CommonCrawlFormat format = CommonCrawlFormatFactory.getCommonCrawlFormat("JACKSON", url, content.getContent(), content.getMetadata(), conf);
+             jsonData = format.getJsonData(false);
+
+             collectStats(typeCounts, mimeType);
+             // collects statistics for the given mimetypes
+             if ((mimeType != null) && (mimeTypes != null) && Arrays.asList(mimeTypes).contains(mimeType)) {
+               collectStats(filteredCounts, mimeType);
+               filter = true;
+             }
+           } catch (Exception e) {
+             e.printStackTrace();
+             LOG.warn("Tika is unable to detect type for: [" + url
+                 + "]");
+           }
+
+           if (!filter) {
+             continue;
+           }
+
+           byte[] byteData = serializeCBORData(jsonData);
+           if (byteData == null) {
+             // serializeCBORData reports failure by returning null
+             LOG.warn("Skipping: [" + url + "]: CBOR serialization failed");
+             continue;
+           }
+
+           if (!gzip) {
+             String outputFullPath = outputDir + File.separator + filename;
+             File outputFile = new File(outputFullPath);
+             if (outputFile.exists()) {
+               LOG.info("Skipping writing: [" + outputFullPath + "]: file already exists");
+             }
+             else {
+               LOG.info("Writing: [" + outputFullPath + "]");
+               FileOutputStream recordOutput = new FileOutputStream(outputFile);
+               try {
+                 recordOutput.write(byteData);
+               } finally {
+                 // release the file handle of every record
+                 recordOutput.close();
+               }
+             }
+           }
+           else {
+             if (fileList.contains(filename)) {
+               LOG.info("Skipping compressing: [" + filename + "]: file already exists");
+             }
+             else {
+               fileList.add(filename);
+               LOG.info("Compressing: [" + filename + "]");
+               TarArchiveEntry tarEntry = new TarArchiveEntry(filename);
+               tarEntry.setSize(byteData.length);
+               tarOutput.putArchiveEntry(tarEntry);
+               tarOutput.write(byteData);
+               tarOutput.closeArchiveEntry();
+             }
+           }
+         }
+       } finally {
+         // the reader is per segment; the file system is not
+         reader.close();
+       }
+     }
+
+     if (gzip) {
+       tarOutput.finish();
+       // closes the gzip, buffer and file streams wrapped by the archive
+       tarOutput.close();
+     }
+   } finally {
+     // releases the archive file handle when a failure skipped the close
+     // above; a second close of an already closed stream is harmless
+     IOUtils.closeQuietly(fileOutput);
+     // closed once, after the last segment, so it stays usable in the loop
+     fs.close();
+   }
+
+   LOG.info("CommonsCrawlDataDumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+ }
+
+ /**
+  * Encodes the given JSON text as a single CBOR text string.
+  *
+  * @param jsonData
+  *          the JSON text to encode.
+  * @return the CBOR bytes, or <code>null</code> when encoding failed (the
+  *         failure is logged as a warning).
+  */
+ private byte[] serializeCBORData(String jsonData) {
+   ByteArrayOutputStream stream = new ByteArrayOutputStream();
+   CBORGenerator generator = null;
+
+   try {
+     generator = new CBORFactory().createGenerator(stream);
+     generator.writeString(jsonData);
+     // push everything buffered by the generator into the byte stream
+     generator.flush();
+
+     return stream.toByteArray();
+
+   } catch (Exception e) {
+     LOG.warn("CBOR encoding failed: " + e.getMessage(), e);
+   } finally {
+     // the generator is still null when its creation failed
+     if (generator != null) {
+       try {
+         generator.close();
+       } catch (IOException ignored) {
+         // nothing to do: the bytes have already been captured or given up
+       }
+     }
+   }
+
+   return null;
+ }
+
+ /**
+  * Increments the counter kept for <code>mimeType</code> in
+  * <code>typeCounts</code>, starting at 1 for a type not seen before.
+  */
+ private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
+   int updated = 1;
+   if (typeCounts.containsKey(mimeType)) {
+     updated = typeCounts.get(mimeType) + 1;
+   }
+   typeCounts.put(mimeType, updated);
+ }
+
+ /**
+  * Renders the mimetype statistics as text for the final log message: the
+  * total counts always, the filtered counts only when there are any.
+  *
+  * @param typeCounts
+  *          number of records per mimetype over all records.
+  * @param filteredCounts
+  *          number of records per mimetype among the requested mimetypes.
+  * @return the formatted statistics.
+  */
+ private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
+   StringBuilder builder = new StringBuilder();
+   // print total stats
+   appendStats(builder, "TOTAL", typeCounts);
+   // filtered types stats
+   if (!filteredCounts.isEmpty()) {
+     appendStats(builder, "FILTERED", filteredCounts);
+   }
+   return builder.toString();
+ }
+
+ /**
+  * Appends one titled statistics section with a
+  * <code>{"mimeType":"...","count":N}</code> line per entry.
+  */
+ private static void appendStats(StringBuilder builder, String title, Map<String, Integer> counts) {
+   builder.append("\n ").append(title).append(" Stats:\n");
+   builder.append(" {\n");
+   for (Map.Entry<String, Integer> entry : counts.entrySet()) {
+     builder.append(" {\"mimeType\":\"");
+     builder.append(entry.getKey());
+     builder.append("\",\"count\":");
+     builder.append(entry.getValue());
+     // the count is a bare number: no quote before the closing brace
+     builder.append("}\n");
+   }
+   builder.append("}\n");
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormat.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+/**
+ * Contract shared by all CommonCrawl formatters: a formatter exposes the
+ * record it was created for as JSON text.
+ *
+ * @author gtotaro
+ *
+ */
+public interface CommonCrawlFormat {
+
+  /**
+   * Returns the JSON representation of the record.
+   *
+   * @param mapAll if {@code true}, maps all metadata onto the JSON structure.
+   * @return the JSON data
+   * @throws IOException if the JSON data cannot be produced.
+   */
+  String getJsonData(boolean mapAll) throws IOException;
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * Factory for {@link CommonCrawlFormat} objects (a.k.a. formatters), which
+ * map crawled files to the CommonCrawl format.
+ *
+ */
+public class CommonCrawlFormatFactory {
+
+  /**
+   * Creates the formatter registered under the given type name. The name is
+   * matched case-insensitively against "jackson", "jettinson" and "simple".
+   *
+   * @param formatType the type of formatter to be created.
+   * @param url the url.
+   * @param content the content.
+   * @param metadata the metadata.
+   * @param conf the configuration.
+   * @return the new {@link CommonCrawlFormat} object, or {@code null} when
+   *         {@code formatType} is {@code null} or not one of the known names.
+   */
+  public static CommonCrawlFormat getCommonCrawlFormat(String formatType, String url, byte[] content,
+      Metadata metadata, Configuration conf) {
+    // constant-first comparisons are false for a null type, so a null
+    // formatType falls through to the final return
+    if ("jackson".equalsIgnoreCase(formatType)) {
+      return new CommonCrawlFormatJackson(url, content, metadata, conf);
+    }
+    if ("jettinson".equalsIgnoreCase(formatType)) {
+      return new CommonCrawlFormatJettinson(url, content, metadata, conf);
+    }
+    if ("simple".equalsIgnoreCase(formatType)) {
+      return new CommonCrawlFormatSimple(url, content, metadata, conf);
+    }
+
+    return null;
+  }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,253 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonGenerator;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jackson Streaming APIs.
+ *
+ */
+public class CommonCrawlFormatJackson extends AbstractCommonCrawlFormat {
+
+ private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJackson.class.getName());
+
+ public CommonCrawlFormatJackson(String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() throws IOException {
+ JsonFactory factory = new JsonFactory();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ JsonGenerator generator = null;
+
+ try {
+ generator = factory.createGenerator(out);
+ generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+ generator.writeStartObject();
+
+ // url
+ generator.writeFieldName("url");
+ generator.writeString(url);
+
+ // timestamp
+ generator.writeFieldName("timestamp");
+ generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
+
+
+ //request
+ generator.writeFieldName("request");
+ generator.writeStartObject();
+ generator.writeFieldName("method");
+ generator.writeString("GET");
+ generator.writeFieldName("client");
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(getHostName());
+ generator.writeFieldName("address");
+ generator.writeString(getHostAddress());
+ generator.writeFieldName("software");
+ generator.writeString(conf.get("http.agent.version", ""));
+ generator.writeFieldName("robots");
+ generator.writeString("classic");
+ generator.writeFieldName("contact");
+ generator.writeStartObject();
+ generator.writeFieldName("name");
+ generator.writeString(conf.get("http.agent.name", ""));
+ generator.writeFieldName("email");
+ generator.writeString(conf.get("http.agent.email", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Accept");
+ generator.writeString(conf.get("accept", ""));
+ generator.writeFieldName("Accept-Encoding");
+ generator.writeString(""); // TODO
+ generator.writeFieldName("Accept-Language");
+ generator.writeString(conf.get("http.accept.language", ""));
+ generator.writeFieldName("User-Agent");
+ generator.writeString(conf.get("http.robots.agents", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("body");
+ generator.writeNull();
+ generator.writeEndObject();
+
+ //response
+ generator.writeFieldName("response");
+ generator.writeStartObject();
+ generator.writeFieldName("status");
+ generator.writeString(ifNullString(metadata.get("status")));
+ generator.writeFieldName("server");
+
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(URLUtil.getHost(url));
+ generator.writeFieldName("address");
+ generator.writeString(ifNullString(metadata.get("_ip_")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ for (String name : metadata.names()) {
+ generator.writeFieldName(name);
+ generator.writeString(ifNullString(metadata.get(name)));
+ }
+ generator.writeEndObject();
+
+ generator.writeFieldName("body");
+ generator.writeString(new String(content));
+ generator.writeEndObject();
+
+ generator.writeFieldName("key");
+ generator.writeString(url);
+
+ generator.writeFieldName("imported"); // TODO
+ generator.writeString("");
+
+ generator.writeEndObject();
+
+ generator.flush();
+
+ return out.toString();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage());
+ }
+ }
+
+ @Override
+ protected String getJsonDataSet() throws IOException {
+ JsonFactory factory = new JsonFactory();
+
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ JsonGenerator generator = null;
+
+ try {
+ generator = factory.createGenerator(out);
+ generator.useDefaultPrettyPrinter(); // INDENTED OUTPUT
+
+ generator.writeStartObject();
+
+ // url
+ generator.writeFieldName("url");
+ generator.writeString(url);
+
+ // timestamp
+ generator.writeFieldName("timestamp");
+ generator.writeString(metadata.get(Metadata.LAST_MODIFIED));
+
+ //request
+ generator.writeFieldName("request");
+ generator.writeStartObject();
+ generator.writeFieldName("method");
+ generator.writeString("GET");
+ generator.writeFieldName("client");
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(getHostName());
+ generator.writeFieldName("address");
+ generator.writeString(getHostAddress());
+ generator.writeFieldName("software");
+ generator.writeString(conf.get("http.agent.version", ""));
+ generator.writeFieldName("robots");
+ generator.writeString("CLASSIC");
+ generator.writeFieldName("contact");
+ generator.writeStartObject();
+ generator.writeFieldName("name");
+ generator.writeString(conf.get("http.agent.name", ""));
+ generator.writeFieldName("email");
+ generator.writeString(conf.get("http.agent.email", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Accept");
+ generator.writeString(conf.get("accept", ""));
+ generator.writeFieldName("Accept-Encoding");
+ generator.writeString(""); // TODO
+ generator.writeFieldName("Accept-Language");
+ generator.writeString(conf.get("http.accept.language", ""));
+ generator.writeFieldName("User-Agent");
+ generator.writeString(conf.get("http.robots.agents", ""));
+ generator.writeEndObject();
+ generator.writeFieldName("body");
+ generator.writeNull();
+ generator.writeEndObject();
+
+ //response
+ generator.writeFieldName("response");
+ generator.writeStartObject();
+ generator.writeFieldName("status");
+ generator.writeString(ifNullString(metadata.get("status")));
+ generator.writeFieldName("server");
+
+ generator.writeStartObject();
+ generator.writeFieldName("hostname");
+ generator.writeString(URLUtil.getHost(url));
+ generator.writeFieldName("address");
+ generator.writeString(ifNullString(metadata.get("_ip_")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("headers");
+ generator.writeStartObject();
+ generator.writeFieldName("Content-Encoding");
+ generator.writeString(ifNullString(metadata.get("Content-Encoding")));
+ generator.writeFieldName("Content-Type");
+ generator.writeString(ifNullString(metadata.get("Content-Type")));
+ generator.writeFieldName("Date");
+ generator.writeString(ifNullString(metadata.get("Date")));
+ generator.writeFieldName("Server");
+ generator.writeString(ifNullString(metadata.get("Server")));
+ generator.writeEndObject();
+
+ generator.writeFieldName("body");
+ generator.writeString(new String(content));
+ generator.writeEndObject();
+
+ generator.writeFieldName("key");
+ generator.writeString(url);
+
+ generator.writeFieldName("imported"); // TODO
+ generator.writeString("");
+
+ generator.writeEndObject();
+
+ generator.flush();
+
+ return out.toString();
+
+ } catch (IOException ioe) {
+ LOG.warn("Error in processing file " + url + ": " + ioe.getMessage());
+ throw new IOException("Error in generating JSON using Jackson:" + ioe.getMessage());
+ }
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+import org.codehaus.jettison.json.JSONException;
+import org.codehaus.jettison.json.JSONObject;
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides methods to map crawled data on JSON using Jettinson APIs.
+ *
+ */
+public class CommonCrawlFormatJettinson extends AbstractCommonCrawlFormat {
+
+ private static final Logger LOG = LoggerFactory.getLogger(CommonCrawlFormatJettinson.class.getName());
+
+ public CommonCrawlFormatJettinson(String url, byte[] content,
+ Metadata metadata, Configuration conf) {
+ super(url, content, metadata, conf);
+ }
+
+ @Override
+ protected String getJsonDataAll() throws IOException {
+ JSONObject object = new JSONObject();
+
+ try {
+ // url
+ object.put("url", url);
+
+ // timestamp
+ object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
+
+ // request
+ JSONObject requestObject = new JSONObject();
+ requestObject.put("method", "GET");
+ JSONObject clientObject = new JSONObject();
+ clientObject.put("hostname", getHostName());
+ clientObject.put("address", getHostAddress());
+ clientObject.put("software", conf.get("http.agent.version", ""));
+ clientObject.put("robots", "CLASSIC");
+ JSONObject contactObject = new JSONObject();
+ contactObject.put("name", conf.get("http.agent.name", ""));
+ contactObject.put("email", conf.get("http.agent.email", ""));
+ clientObject.put("contact", contactObject);
+ requestObject.put("client", clientObject);
+ JSONObject reqHeadersObject = new JSONObject();
+ reqHeadersObject.put("Accept", conf.get("http.accept", ""));
+ reqHeadersObject.put("Accept-Encoding", ""); // TODO
+ reqHeadersObject.put("Accept-Language", conf.get("http.accept.language", ""));
+ reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
+ requestObject.put("headers", reqHeadersObject);
+ requestObject.put("body", JSONObject.NULL);
+ object.put("request", requestObject);
+
+ // response
+ JSONObject responseObject = new JSONObject();
+ responseObject.put("status", ifNullString(metadata.get("status")));
+ JSONObject serverObject = new JSONObject();
+ serverObject.put("hostname", URLUtil.getHost(url));
+ serverObject.put("address", ifNullString(metadata.get("_ip_")));
+ responseObject.put("client", serverObject);
+ JSONObject respHeadersObject = new JSONObject();
+ for (String name : metadata.names()) {
+ respHeadersObject.put(name, ifNullString(metadata.get(name)));
+ }
+ responseObject.put("headers", respHeadersObject);
+ responseObject.put("body", new String(content));
+ object.put("response", responseObject);
+
+ // key
+ object.put("key", url);
+
+ // imported
+ object.put("imported", ""); // TODO
+
+ return object.toString(2); // INDENTED OUTPUT
+
+ } catch (JSONException jsone) {
+ LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
+ throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage());
+ }
+ }
+
+ @Override
+ protected String getJsonDataSet() throws IOException {
+ JSONObject object = new JSONObject();
+
+ try {
+ // url
+ object.put("url", url);
+
+ // timestamp
+ object.put("timestamp", metadata.get(Metadata.LAST_MODIFIED));
+
+ // request
+ JSONObject requestObject = new JSONObject();
+ requestObject.put("method", "GET");
+ JSONObject clientObject = new JSONObject();
+ clientObject.put("hostname", getHostName());
+ clientObject.put("address", getHostAddress());
+ clientObject.put("software", conf.get("http.agent.version", ""));
+ clientObject.put("robots", "CLASSIC");
+ JSONObject contactObject = new JSONObject();
+ contactObject.put("name", conf.get("http.agent.name", ""));
+ contactObject.put("email", conf.get("http.agent.email", ""));
+ clientObject.put("contact", contactObject);
+ requestObject.put("client", clientObject);
+ JSONObject reqHeadersObject = new JSONObject();
+ reqHeadersObject.put("Accept", conf.get("http.accept", ""));
+ reqHeadersObject.put("Accept-Encoding", ""); // TODO
+ reqHeadersObject.put("Accept-Language", conf.get("http.accept.language", ""));
+ reqHeadersObject.put("User-Agent", conf.get("http.robots.agents", ""));
+ requestObject.put("headers", reqHeadersObject);
+ requestObject.put("body", JSONObject.NULL);
+ object.put("request", requestObject);
+
+ // response
+ JSONObject responseObject = new JSONObject();
+ responseObject.put("status", ifNullString(metadata.get("status")));
+ JSONObject serverObject = new JSONObject();
+ serverObject.put("hostname", URLUtil.getHost(url));
+ serverObject.put("address", ifNullString(metadata.get("_ip_")));
+ responseObject.put("client", serverObject);
+ JSONObject respHeadersObject = new JSONObject();
+ respHeadersObject.put("Content-Encoding", ifNullString(metadata.get("Content-Encoding")));
+ respHeadersObject.put("Content-Type", ifNullString(metadata.get("Content-Type")));
+ respHeadersObject.put("Date", ifNullString(metadata.get("Date")));
+ respHeadersObject.put("Server", ifNullString(metadata.get("Server")));
+ responseObject.put("headers", respHeadersObject);
+ responseObject.put("body", new String(content));
+ object.put("response", responseObject);
+
+ // key
+ object.put("key", url);
+
+ // imported
+ object.put("imported", ""); // TODO
+
+ return object.toString(2); // INDENTED OUTPUT
+
+ } catch (JSONException jsone) {
+ LOG.warn("Error in processing file " + url + ": " + jsone.getMessage());
+ throw new IOException("Error in generating JSON using Jettinson:" + jsone.getMessage());
+ }
+ }
+}
Added: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java?rev=1664109&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java Wed Mar 4 18:48:32 2015
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * This class provides methods to map crawled data on JSON using a {@link StringBuilder} object.
+ *
+ * Every name and value is passed through {@link #quote(String)}, so the output stays
+ * well-formed JSON even when a value contains quotes, backslashes or control characters.
+ */
+public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {
+
+  /** Response headers written by {@link #getJsonDataSet()}, in output order. */
+  private static final String[] RESPONSE_HEADER_SUBSET = {
+      "Content-Encoding", "Content-Type", "Date", "Server" };
+
+  public CommonCrawlFormatSimple(String url, byte[] content, Metadata metadata,
+      Configuration conf) {
+    super(url, content, metadata, conf);
+  }
+
+  /**
+   * Generates the JSON document with every metadata entry written as a response header.
+   *
+   * @return the tab-indented JSON document.
+   */
+  @Override
+  protected String getJsonDataAll() {
+    return buildJson(true);
+  }
+
+  /**
+   * Generates the JSON document with only the Content-Encoding, Content-Type,
+   * Date and Server response headers.
+   *
+   * @return the tab-indented JSON document.
+   */
+  @Override
+  protected String getJsonDataSet() {
+    return buildJson(false);
+  }
+
+  /**
+   * Assembles the url, timestamp, request, response, key and imported members.
+   *
+   * @param allHeaders {@code true} to write every metadata name as a response header,
+   *        {@code false} to write only {@link #RESPONSE_HEADER_SUBSET}.
+   * @return the tab-indented JSON document.
+   */
+  private String buildJson(boolean allHeaders) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("{\n");
+
+    appendField(sb, "\t", "url", url, false);
+    appendField(sb, "\t", "timestamp", metadata.get(Metadata.LAST_MODIFIED), false);
+
+    // request
+    sb.append("\t\"request\": {\n");
+    appendField(sb, "\t\t", "method", "GET", false);
+    sb.append("\t\t\"client\": {\n");
+    appendField(sb, "\t\t\t", "hostname", getHostName(), false);
+    appendField(sb, "\t\t\t", "address", getHostAddress(), false);
+    appendField(sb, "\t\t\t", "software", conf.get("http.agent.version", ""), false);
+    appendField(sb, "\t\t\t", "robots", "CLASSIC", false);
+    sb.append("\t\t\t\"contact\": {\n");
+    appendField(sb, "\t\t\t\t", "name", conf.get("http.agent.name", ""), false);
+    appendField(sb, "\t\t\t\t", "email", conf.get("http.agent.email", ""), true);
+    sb.append("\t\t\t}\n");
+    sb.append("\t\t},\n");
+    sb.append("\t\t\"headers\": {\n");
+    appendField(sb, "\t\t\t", "Accept", conf.get("http.accept", ""), false);
+    appendField(sb, "\t\t\t", "Accept-Encoding", "", false); // TODO
+    appendField(sb, "\t\t\t", "Accept-Language", conf.get("http.accept.language", ""), false);
+    appendField(sb, "\t\t\t", "User-Agent", conf.get("http.robots.agents", ""), true);
+    sb.append("\t\t},\n");
+    // A null request body matches what the Jackson and Jettinson formatters write.
+    sb.append("\t\t\"body\": null\n");
+    sb.append("\t},\n");
+
+    // response
+    sb.append("\t\"response\": {\n");
+    appendField(sb, "\t\t", "status", ifNullString(metadata.get("status")), false);
+    sb.append("\t\t\"server\": {\n");
+    appendField(sb, "\t\t\t", "hostname", URLUtil.getHost(url), false);
+    appendField(sb, "\t\t\t", "address", ifNullString(metadata.get("_ip_")), true);
+    sb.append("\t\t},\n");
+    sb.append("\t\t\"headers\": {");
+    // The separator starts as a bare newline and becomes ",\n" after the first header,
+    // so no trailing comma is written and an empty header set still yields "{ }".
+    String separator = "\n";
+    if (allHeaders) {
+      for (String name : metadata.names()) {
+        sb.append(separator);
+        appendEntry(sb, "\t\t\t", name, ifNullString(metadata.get(name)));
+        separator = ",\n";
+      }
+    } else {
+      for (String name : RESPONSE_HEADER_SUBSET) {
+        sb.append(separator);
+        appendEntry(sb, "\t\t\t", name, ifNullString(metadata.get(name)));
+        separator = ",\n";
+      }
+    }
+    sb.append("\n\t\t},\n");
+    // NOTE(review): content is decoded with the platform default charset, as before;
+    // the real encoding of the fetched bytes is not known here.
+    appendField(sb, "\t\t", "body", new String(content), true);
+    sb.append("\t},\n");
+
+    appendField(sb, "\t", "key", url, false);
+    appendField(sb, "\t", "imported", "", true); // TODO
+
+    sb.append("}");
+
+    return sb.toString();
+  }
+
+  /** Appends one indented {@code "name": value} line, followed by a comma unless it is the last member. */
+  private static void appendField(StringBuilder sb, String indent, String name, String value,
+      boolean last) {
+    appendEntry(sb, indent, name, value);
+    sb.append(last ? "\n" : ",\n");
+  }
+
+  /** Appends an indented {@code "name": value} pair without any line terminator. */
+  private static void appendEntry(StringBuilder sb, String indent, String name, String value) {
+    sb.append(indent).append(quote(name)).append(": ").append(quote(value));
+  }
+
+  /**
+   * Renders a Java string as a JSON value.
+   *
+   * @param value the text to render; may be {@code null}.
+   * @return the JSON literal {@code null} for a {@code null} input, otherwise the text in
+   *         double quotes with quotes, backslashes and control characters escaped.
+   */
+  private static String quote(String value) {
+    if (value == null) {
+      return "null";
+    }
+    StringBuilder out = new StringBuilder(value.length() + 2);
+    out.append('"');
+    for (int i = 0; i < value.length(); i++) {
+      char c = value.charAt(i);
+      switch (c) {
+      case '"':
+        out.append("\\\"");
+        break;
+      case '\\':
+        out.append("\\\\");
+        break;
+      case '\b':
+        out.append("\\b");
+        break;
+      case '\f':
+        out.append("\\f");
+        break;
+      case '\n':
+        out.append("\\n");
+        break;
+      case '\r':
+        out.append("\\r");
+        break;
+      case '\t':
+        out.append("\\t");
+        break;
+      default:
+        if (c < 0x20) {
+          // Remaining control characters have no short escape and must use the \\uXXXX form.
+          out.append(String.format("\\u%04x", (int) c));
+        } else {
+          out.append(c);
+        }
+      }
+    }
+    out.append('"');
+    return out.toString();
+  }
+
+}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1664109&r1=1664108&r2=1664109&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Mar 4 18:48:32 2015
@@ -100,7 +100,7 @@ import org.slf4j.LoggerFactory;
* }
* </pre>
* <p>
- * In the case above the tool would have been run with the <b>-mimeType
+ * In the case above, the tool would have been run with the <b>-mimeType
* image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
* flag and corresponding values activated.
*