You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/25 17:56:39 UTC

svn commit: r1676029 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Author: mattmann
Date: Sat Apr 25 15:56:39 2015
New Revision: 1676029

URL: http://svn.apache.org/r1676029
Log:
NUTCH-1997: Fix for Add CBOR magic header to CommonCrawlDataDumper output contributed by Giuseppe Totaro, and Luke Sh.

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 25 15:56:39 2015
@@ -2,6 +2,9 @@ Nutch Change Log
  
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1997 Add CBOR "magic header" to CommonCrawlDataDumper 
+  output (Giuseppe Totaro, Luke Sh via mattmann)
+
 * NUTCH-1991 Tika mime detection not using Nutch supplied tika-mimetypes.xml for content based 
   detection (Iain Lopata, snagel via mattmann)
 

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat Apr 25 15:56:39 2015
@@ -515,6 +515,31 @@ public class CommonCrawlDataDumper {
 	    tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
 	}
 	
+	/**
+	 * Writes the CBOR "Self-Describe Tag" (value 55799, serialized as 3-byte
+	 * sequence of {@code 0xd9d9f7}) at the current position. This method must
+	 * be used to write the CBOR magic number at the beginning of the document.
+	 * Since version 2.5, <a
+	 * href="https://github.com/FasterXML/jackson-dataformat-cbor"
+	 * >jackson-dataformat-cbor</a> will support the {@code WRITE_TYPE_HEADER}
+	 * feature to write that type tag at the beginning of the document.
+	 * 
+	 * @see <a href="https://tools.ietf.org/html/rfc7049#section-2.4.5">RFC
+	 *      7049</a>
+	 * @param generator {@link CBORGenerator} object used to create a CBOR-encoded document.
+	 * @throws IOException if any I/O error occurs.
+	 */
+	private void writeMagicHeader(CBORGenerator generator) throws IOException {
+		// Writes self-describe CBOR
+		// https://tools.ietf.org/html/rfc7049#section-2.4.5
+		// It will be supported in jackson-cbor since 2.5
+		byte[] header = new byte[3];
+		header[0] = (byte) 0xd9;
+		header[1] = (byte) 0xd9;
+		header[2] = (byte) 0xf7;
+		generator.writeBytes(header, 0, header.length);
+	}
+	
 	private byte[] serializeCBORData(String jsonData) {
 		CBORFactory factory = new CBORFactory();
 		
@@ -524,6 +549,8 @@ public class CommonCrawlDataDumper {
 		try {
 			stream = new ByteArrayOutputStream();
 			generator = factory.createGenerator(stream);
+			// Writes CBOR tag
+			writeMagicHeader(generator);
 			generator.writeString(jsonData);
 			generator.flush();
 			stream.flush();