You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/25 17:56:39 UTC
svn commit: r1676029 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Author: mattmann
Date: Sat Apr 25 15:56:39 2015
New Revision: 1676029
URL: http://svn.apache.org/r1676029
Log:
NUTCH-1997: Fix for "Add CBOR magic header to CommonCrawlDataDumper output", contributed by Giuseppe Totaro and Luke Sh.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 25 15:56:39 2015
@@ -2,6 +2,9 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1997 Add CBOR "magic header" to CommonCrawlDataDumper
+ output (Giuseppe Totaro, Luke Sh via mattmann)
+
* NUTCH-1991 Tika mime detection not using Nutch supplied tika-mimetypes.xml for content based
detection (Iain Lopata, snagel via mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1676029&r1=1676028&r2=1676029&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Sat Apr 25 15:56:39 2015
@@ -515,6 +515,31 @@ public class CommonCrawlDataDumper {
tarOutput.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
}
+ /**
+ * Writes the CBOR "Self-Describe Tag" (value 55799, serialized as the 3-byte
+ * sequence {@code 0xd9d9f7}) at the current position. This method must
+ * be used to write the CBOR magic number at the beginning of the document.
+ * Since version 2.5, <a
+ * href="https://github.com/FasterXML/jackson-dataformat-cbor"
+ * >jackson-dataformat-cbor</a> will support the {@code WRITE_TYPE_HEADER}
+ * feature to write that type tag at the beginning of the document.
+ *
+ * @see <a href="https://tools.ietf.org/html/rfc7049#section-2.4.5">RFC
+ * 7049</a>
+ * @param generator {@link CBORGenerator} object used to create a CBOR-encoded document.
+ * @throws IOException if any I/O error occurs.
+ */
+ private void writeMagicHeader(CBORGenerator generator) throws IOException {
+ // Writes the CBOR self-describe tag; see
+ // https://tools.ietf.org/html/rfc7049#section-2.4.5
+ // jackson-dataformat-cbor supports this natively as of version 2.5
+ byte[] header = new byte[3];
+ header[0] = (byte) 0xd9;
+ header[1] = (byte) 0xd9;
+ header[2] = (byte) 0xf7;
+ generator.writeBytes(header, 0, header.length);
+ }
+
private byte[] serializeCBORData(String jsonData) {
CBORFactory factory = new CBORFactory();
@@ -524,6 +549,8 @@ public class CommonCrawlDataDumper {
try {
stream = new ByteArrayOutputStream();
generator = factory.createGenerator(stream);
+ // Writes CBOR tag
+ writeMagicHeader(generator);
generator.writeString(jsonData);
generator.flush();
stream.flush();