You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucy.apache.org by nw...@apache.org on 2015/08/06 18:19:48 UTC
[11/20] lucy-clownfish git commit: Upgrade libcmark to 0.21.0

Upgrade libcmark to 0.21.0


Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/89c7b809
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/89c7b809
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/89c7b809

Branch: refs/heads/master
Commit: 89c7b8096972697e9c9d8153f5b58c9f5274e20a
Parents: aec7214
Author: Nick Wellnhofer <we...@aevum.de>
Authored: Sat Jul 25 18:48:05 2015 +0200
Committer: Nick Wellnhofer <we...@aevum.de>
Committed: Thu Aug 6 18:19:19 2015 +0200

----------------------------------------------------------------------
 compiler/modules/CommonMark/COPYING             |    73 +-
 compiler/modules/CommonMark/README.md           |   298 +-
 compiler/modules/CommonMark/src/blocks.c        |   556 +-
 compiler/modules/CommonMark/src/buffer.c        |   229 +-
 compiler/modules/CommonMark/src/buffer.h        |    94 +-
 compiler/modules/CommonMark/src/chunk.h         |    33 +-
 compiler/modules/CommonMark/src/cmark.c         |    16 +-
 compiler/modules/CommonMark/src/cmark.h         |    93 +-
 compiler/modules/CommonMark/src/cmark_version.h |     7 +
 compiler/modules/CommonMark/src/commonmark.c    |   462 +
 compiler/modules/CommonMark/src/config.h        |     4 +
 compiler/modules/CommonMark/src/config.h.in     |    23 -
 compiler/modules/CommonMark/src/debug.h         |    36 -
 compiler/modules/CommonMark/src/entities.inc    |  2138 ++
 compiler/modules/CommonMark/src/houdini.h       |    19 +-
 .../modules/CommonMark/src/houdini_href_e.c     |     4 +-
 .../modules/CommonMark/src/houdini_html_e.c     |     6 +-
 .../modules/CommonMark/src/houdini_html_u.c     |    92 +-
 compiler/modules/CommonMark/src/html.c          |    75 +-
 .../modules/CommonMark/src/html_unescape.gperf  |  2130 --
 compiler/modules/CommonMark/src/html_unescape.h | 13375 ---------
 compiler/modules/CommonMark/src/inlines.c       |   455 +-
 compiler/modules/CommonMark/src/inlines.h       |     8 +-
 compiler/modules/CommonMark/src/iterator.c      |    16 +-
 compiler/modules/CommonMark/src/latex.c         |   430 +
 compiler/modules/CommonMark/src/libcmark.pc.in  |    10 -
 compiler/modules/CommonMark/src/man.c           |   240 +-
 compiler/modules/CommonMark/src/node.c          |   175 +-
 compiler/modules/CommonMark/src/node.h          |    22 +-
 compiler/modules/CommonMark/src/parser.h        |     9 +-
 compiler/modules/CommonMark/src/references.c    |     4 +-
 compiler/modules/CommonMark/src/references.h    |     4 +-
 compiler/modules/CommonMark/src/render.c        |   186 +
 compiler/modules/CommonMark/src/render.h        |    66 +
 compiler/modules/CommonMark/src/scanners.c      | 25069 +++++++++++------
 compiler/modules/CommonMark/src/scanners.h      |    46 +-
 compiler/modules/CommonMark/src/scanners.re     |   185 +-
 compiler/modules/CommonMark/src/utf8.c          |   102 +-
 compiler/modules/CommonMark/src/utf8.h          |     6 +-
 compiler/modules/CommonMark/src/xml.c           |    24 +-
 compiler/src/CFCCHtml.c                         |     5 +-
 compiler/src/CFCCMan.c                          |     6 +-
 compiler/src/CFCPerlPod.c                       |     6 +-
 43 files changed, 21879 insertions(+), 24958 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/COPYING
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/COPYING b/compiler/modules/CommonMark/COPYING
index 0bb3445..8099de3 100644
--- a/compiler/modules/CommonMark/COPYING
+++ b/compiler/modules/CommonMark/COPYING
@@ -13,10 +13,6 @@ modification, are permitted provided that the following conditions are met:
       disclaimer in the documentation and/or other materials provided
       with the distribution.
 
-    * Neither the name of John MacFarlane nor the names of other
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -108,10 +104,12 @@ DEALINGS IN THE SOFTWARE.
 
 -----
 
-normalize-reference.js is a slightly modified version of
-https://github.com/dmoscrop/fold-case:
+The normalization code in runtests.py was derived from the
+markdowntest project, Copyright 2013 Karl Dubost:
+
+The MIT License (MIT)
 
-Copyright Mathias Bynens <https://mathiasbynens.be/>
+Copyright (c) 2013 Karl Dubost
 
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
@@ -134,27 +132,43 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 -----
 
-The polyfill for String.fromCodePoint included in commonmark.js is
-Copyright Mathias Bynens <http://mathiasbynens.be/>
+The CommonMark spec (test/spec.txt) is
 
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
+Copyright (C) 2014-15 John MacFarlane
 
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
+Released under the Creative Commons CC-BY-SA 4.0 license:
+<http://creativecommons.org/licenses/by-sa/4.0/>.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+-----
+
+The test software in test/ is
+
+Copyright (c) 2014, John MacFarlane
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 -----
 
@@ -169,5 +183,10 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of
 
 The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/README.md
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/README.md b/compiler/modules/CommonMark/README.md
index 4bbac88..4b73cb2 100644
--- a/compiler/modules/CommonMark/README.md
+++ b/compiler/modules/CommonMark/README.md
@@ -1,37 +1,66 @@
-CommonMark
-==========
+cmark
+=====
 
-CommonMark is a rationalized version of Markdown syntax,
-with a [spec][the spec] and BSD3-licensed reference
-implementations in C and JavaScript.
+[![Build Status]](https://travis-ci.org/jgm/cmark)
+[![Windows Build Status]](https://ci.appveyor.com/project/jgm/cmark)
 
-[Try it now!](http://spec.commonmark.org/dingus.html)
+`cmark` is the C reference implementation of [CommonMark], a
+rationalized version of Markdown syntax with a [spec][the spec].
+(For the JavaScript reference implementation, see
+[commonmark.js].)
 
-The implementations
--------------------
+It provides a shared library (`libcmark`) with functions for parsing
+CommonMark documents to an abstract syntax tree (AST), manipulating
+the AST, and rendering the document to HTML, groff man, LaTeX,
+CommonMark, or an XML representation of the AST.  It also provides a
+command-line program (`cmark`) for parsing and rendering CommonMark
+documents.
 
-The C implementation provides both a shared library (`libcmark`) and a
-standalone program `cmark` that converts CommonMark to HTML.  It is
-written in standard C99 and has no library dependencies.  The parser is
-very fast (see [benchmarks](benchmarks.md)).
+Advantages of this library:
+
+- **Portable.**  The library and program are written in standard
+  C99 and have no external dependencies.  They have been tested with
+  MSVC, gcc, tcc, and clang.
+
+- **Fast.** cmark can render a Markdown version of *War and Peace* in
+  the blink of an eye (127 milliseconds on a ten year old laptop,
+  vs. 100-400 milliseconds for an eye blink).  In our [benchmarks],
+  cmark is 10,000 times faster than the original `Markdown.pl`, and
+  on par with the very fastest available Markdown processors.
+
+- **Accurate.** The library passes all CommonMark conformance tests.
+
+- **Standardized.** The library can be expected to parse CommonMark
+  the same way as any other conforming parser.  So, for example,
+  you can use `commonmark.js` on the client to preview content that
+  will be rendered on the server using `cmark`.
+
+- **Robust.** The library has been extensively fuzz-tested using
+  [american fuzzy lop].  The test suite includes pathological cases
+  that bring many other Markdown parsers to a crawl (for example,
+  thousands-deep nested bracketed text or block quotes).
+
+- **Flexible.** CommonMark input is parsed to an AST which can be
+  manipulated programmatically prior to rendering.
+
+- **Multiple renderers.**  Output in HTML, groff man, LaTeX, CommonMark,
+  and a custom XML format is supported. And it is easy to write new
+  renderers to support other formats.
+
+- **Free.** BSD2-licensed.
 
 It is easy to use `libcmark` in python, lua, ruby, and other dynamic
 languages: see the `wrappers/` subdirectory for some simple examples.
 
-The JavaScript implementation provides both an NPM package and a
-single JavaScript file, with no dependencies, that can be linked into
-an HTML page. For further information, see the
-[README in the js directory](js/README.md).
+There are also libraries that wrap `libcmark` for
+[go](https://github.com/rhinoman/go-commonmark),
+[Haskell](http://hackage.haskell.org/package/cmark),
+[ruby](https://github.com/gjtorikian/commonmarker),
+[Perl](https://metacpan.org/release/CommonMark), and
+[R](http://cran.r-project.org/package=commonmark).
 
-**A note on security:**
-Neither implementation attempts to sanitize link attributes or
-raw HTML.  If you use these libraries in applications that accept
-untrusted user input, you must run the output through an HTML
-sanitizer to protect against
-[XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting).
-
-Installing (C)
---------------
+Installing
+----------
 
 Building the C program (`cmark`) and shared library (`libcmark`)
 requires [cmake].  If you modify `scanners.re`, then you will also
@@ -42,7 +71,10 @@ the repository to reduce build dependencies.
 If you have GNU make, you can simply `make`, `make test`, and `make
 install`.  This calls [cmake] to create a `Makefile` in the `build`
 directory, then uses that `Makefile` to create the executable and
-library.  The binaries can be found in `build/src`.
+library.  The binaries can be found in `build/src`.  The default
+installation prefix is `/usr/local`.  To change the installation
+prefix, pass the `INSTALL_PREFIX` variable when you run `make` for the
+first time: `make INSTALL_PREFIX=path`.
 
 For a more portable method, you can use [cmake] manually. [cmake] knows
 how to create build environments for many build systems.  For example,
@@ -60,19 +92,13 @@ Or, to create Xcode project files on OSX:
     mkdir build
     cd build
     cmake -G Xcode ..
-    make
-    make test
-    make install
+    open cmark.xcodeproj
 
 The GNU Makefile also provides a few other targets for developers.
 To run a benchmark:
 
     make bench
 
-To run a "fuzz test" against ten long randomly generated inputs:
-
-    make fuzztest
-
 To run a test for memory leaks using `valgrind`:
 
     make leakcheck
@@ -81,13 +107,20 @@ To reformat source code using `astyle`:
 
     make astyle
 
+To run a "fuzz test" against ten long randomly generated inputs:
+
+    make fuzztest
+
+To do a more systematic fuzz test with [american fuzzy lop]:
+
+    AFL_PATH=/path/to/afl_directory make afl
+
 To make a release tarball and zip archive:
 
     make archive
 
-
-Compiling for Windows
----------------------
+Installing (Windows)
+--------------------
 
 To compile with MSVC and NMAKE:
 
@@ -100,148 +133,23 @@ You can cross-compile a Windows binary and dll on linux if you have the
 
 The binaries will be in `build-mingw/windows/bin`.
 
-Installing (JavaScript)
------------------------
-
-The JavaScript library can be installed through `npm`:
+Usage
+-----
 
-    npm install commonmark
+Instructions for the use of the command line program and library can
+be found in the man pages in the `man` subdirectory.
 
-This includes a command-line converter called `commonmark`.
-
-If you want to use it in a client application, you can fetch
-a pre-built copy of `commonmark.js` from
-<http://spec.commonmark.org/js/commonmark.js>.
-
-For further information, see the
-[README in the js directory](js/README.md).
-
-The spec
+Security
 --------
 
-[The spec] contains over 500 embedded examples which serve as conformance
-tests. To run the tests using an executable `$PROG`:
-
-    python3 test/spec_tests.py --program $PROG
-
-If you want to extract the raw test data from the spec without
-actually running the tests, you can do:
-
-    python3 test/spec_tests.py --dump-tests
-
-and you'll get all the tests in JSON format.
-
-[The spec]:  http://spec.commonmark.org/0.13/
-
-The source of [the spec] is `spec.txt`.  This is basically a Markdown
-file, with code examples written in a shorthand form:
-
-    .
-    Markdown source
-    .
-    expected HTML output
-    .
-
-To build an HTML version of the spec, do `make spec.html`.  To build a
-PDF version, do `make spec.pdf`.  (Creating a PDF requires [pandoc]
-and a LaTeX installation.  Creating the HTML version requires only
-`libcmark` and `python3`.)
-
-The spec is written from the point of view of the human writer, not
-the computer reader.  It is not an algorithm---an English translation of
-a computer program---but a declarative description of what counts as a block
-quote, a code block, and each of the other structural elements that can
-make up a Markdown document.
-
-Because John Gruber's [canonical syntax
-description](http://daringfireball.net/projects/markdown/syntax) leaves
-many aspects of the syntax undetermined, writing a precise spec requires
-making a large number of decisions, many of them somewhat arbitrary.
-In making them, we have appealed to existing conventions and
-considerations of simplicity, readability, expressive power, and
-consistency.  We have tried to ensure that "normal" documents in the many
-incompatible existing implementations of Markdown will render, as far as
-possible, as their authors intended.  And we have tried to make the rules
-for different elements work together harmoniously.  In places where
-different decisions could have been made (for example, the rules
-governing list indentation), we have explained the rationale for
-my choices.  In a few cases, we have departed slightly from the canonical
-syntax description, in ways that we think further the goals of Markdown
-as stated in that description.
-
-For the most part, we have limited ourselves to the basic elements
-described in Gruber's canonical syntax description, eschewing extensions
-like footnotes and definition lists.  It is important to get the core
-right before considering such things. However, we have included a visible
-syntax for line breaks and fenced code blocks.
-
-Differences from original Markdown
-----------------------------------
-
-There are only a few places where this spec says things that contradict
-the canonical syntax description:
-
--   It allows all punctuation symbols to be backslash-escaped,
-    not just the symbols with special meanings in Markdown. We found
-    that it was just too hard to remember which symbols could be
-    escaped.
-
--   It introduces an alternative syntax for hard line
-    breaks, a backslash at the end of the line, supplementing the
-    two-spaces-at-the-end-of-line rule. This is motivated by persistent
-    complaints about the “invisible” nature of the two-space rule.
-
--   Link syntax has been made a bit more predictable (in a
-    backwards-compatible way). For example, `Markdown.pl` allows single
-    quotes around a title in inline links, but not in reference links.
-    This kind of difference is really hard for users to remember, so the
-    spec allows single quotes in both contexts.
-
--   The rule for HTML blocks differs, though in most real cases it
-    shouldn't make a difference. (See the section on HTML Blocks
-    for details.) The spec's proposal makes it easy to include Markdown
-    inside HTML block-level tags, if you want to, but also allows you to
-    exclude this. It is also makes parsing much easier, avoiding
-    expensive backtracking.
-
--   It does not collapse adjacent bird-track blocks into a single
-    blockquote:
-
-        > this is two
-
-        > blockquotes
-
-        > this is a single
-        >
-        > blockquote with two paragraphs
-
--   Rules for content in lists differ in a few respects, though (as with
-    HTML blocks), most lists in existing documents should render as
-    intended. There is some discussion of the choice points and
-    differences in the subsection of List Items entitled Motivation.
-    We think that the spec's proposal does better than any existing
-    implementation in rendering lists the way a human writer or reader
-    would intuitively understand them. (We could give numerous examples
-    of perfectly natural looking lists that nearly every existing
-    implementation flubs up.)
-
--   The spec stipulates that two blank lines break out of all list
-    contexts.  This is an attempt to deal with issues that often come up
-    when someone wants to have two adjacent lists, or a list followed by
-    an indented code block.
-
--   Changing bullet characters, or changing from bullets to numbers or
-    vice versa, starts a new list. We think that is almost always going
-    to be the writer's intent.
-
--   The number that begins an ordered list item may be followed by
-    either `.` or `)`. Changing the delimiter style starts a new
-    list.
-
--   The start number of an ordered list is significant.
-
--   Fenced code blocks are supported, delimited by either
-    backticks (```` ``` ```` or tildes (` ~~~ `).
+By default, the library will pass through raw HTML and potentially
+dangerous links (`javascript:`, `vbscript:`, `data:`, `file:`).
+
+It is recommended that users either disable this potentially unsafe
+feature by using the option `CMARK_OPT_SAFE` (or `--safe` with the
+command-line program), or run the output through an HTML sanitizer
+to protect against
+[XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting).
 
 Contributing
 ------------
@@ -255,31 +163,21 @@ only for simple, clear, actionable issues.
 Authors
 -------
 
-The spec was written by John MacFarlane, drawing on
-
-- his experience writing and maintaining Markdown implementations in several
-  languages, including the first Markdown parser not based on regular
-  expression substitutions ([pandoc](http://github.com/jgm/pandoc)) and
-  the first markdown parsers based on PEG grammars
-  ([peg-markdown](http://github.com/jgm/peg-markdown),
-  [lunamark](http://github.com/jgm/lunamark))
-- a detailed examination of the differences between existing Markdown
-  implementations using [BabelMark 2](http://johnmacfarlane.net/babelmark2/),
-  and
-- extensive discussions with David Greenspan, Jeff Atwood, Vicent
-  Marti, Neil Williams, and Benjamin Dumke-von der Ehe.
-
-John MacFarlane was also responsible for the original versions of the
-C and JavaScript implementations.  The block parsing algorithm was
-worked out together with David Greenspan.  Vicent Marti
-optimized the C implementation for performance, increasing its speed
-tenfold.  Kārlis Gaņģis helped work out a better parsing algorithm
-for links and emphasis, eliminating several worst-case performance
-issues.  Nick Wellnhofer contributed many improvements, including
-most of the C library's API and its test harness.  Vitaly Puzrin
-has offered much good advice about the JavaScript implementation.
-
+John MacFarlane wrote the original library and program.
+The block parsing algorithm was worked out together with David
+Greenspan. Vicent Marti optimized the C implementation for
+performance, increasing its speed tenfold.  Kārlis Gaņģis helped
+work out a better parsing algorithm for links and emphasis,
+eliminating several worst-case performance issues.
+Nick Wellnhofer contributed many improvements, including
+most of the C library's API and its test harness.
+
+[benchmarks]: benchmarks.md
+[the spec]: http://spec.commonmark.org
+[CommonMark]: http://commonmark.org
 [cmake]: http://www.cmake.org/download/
-[pandoc]: http://johnmacfarlane.net/pandoc/
 [re2c]: http://re2c.org
-
+[commonmark.js]: https://github.com/jgm/commonmark.js
+[Build Status]: https://img.shields.io/travis/jgm/cmark/master.svg?style=flat
+[Windows Build Status]: https://ci.appveyor.com/api/projects/status/32r7s2skrgm9ubva?svg=true
+[american fuzzy lop]: http://lcamtuf.coredump.cx/afl/

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/blocks.c
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/blocks.c b/compiler/modules/CommonMark/src/blocks.c
index dafbb9b..f8b7495 100644
--- a/compiler/modules/CommonMark/src/blocks.c
+++ b/compiler/modules/CommonMark/src/blocks.c
@@ -13,18 +13,25 @@
 #include "inlines.h"
 #include "houdini.h"
 #include "buffer.h"
-#include "debug.h"
 
 #define CODE_INDENT 4
+#define TAB_STOP 4
+
 #define peek_at(i, n) (i)->data[n]
 
+static inline bool
+S_is_line_end_char(char c)
+{
+	return (c == '\n' || c == '\r');
+}
+
 static void
 S_parser_feed(cmark_parser *parser, const unsigned char *buffer, size_t len,
               bool eof);
 
 static void
 S_process_line(cmark_parser *parser, const unsigned char *buffer,
-               size_t bytes);
+               bufsize_t bytes);
 
 static cmark_node* make_block(cmark_node_type tag, int start_line, int start_column)
 {
@@ -43,14 +50,14 @@ static cmark_node* make_block(cmark_node_type tag, int start_line, int start_col
 	return e;
 }
 
-// Create a root document cmark_node.
+// Create a root document node.
 static cmark_node* make_document()
 {
 	cmark_node *e = make_block(NODE_DOCUMENT, 1, 1);
 	return e;
 }
 
-cmark_parser *cmark_parser_new()
+cmark_parser *cmark_parser_new(int options)
 {
 	cmark_parser *parser = (cmark_parser*)malloc(sizeof(cmark_parser));
 	cmark_node *document = make_document();
@@ -63,9 +70,16 @@ cmark_parser *cmark_parser_new()
 	parser->root = document;
 	parser->current = document;
 	parser->line_number = 0;
+	parser->offset = 0;
+	parser->column = 0;
+	parser->first_nonspace = 0;
+	parser->first_nonspace_column = 0;
+	parser->indent = 0;
+	parser->blank = false;
 	parser->curline = line;
 	parser->last_line_length = 0;
 	parser->linebuf = buf;
+	parser->options = options;
 
 	return parser;
 }
@@ -84,15 +98,19 @@ static cmark_node*
 finalize(cmark_parser *parser, cmark_node* b);
 
 // Returns true if line has only space characters, else false.
-static bool is_blank(cmark_strbuf *s, int offset)
+static bool is_blank(cmark_strbuf *s, bufsize_t offset)
 {
 	while (offset < s->size) {
 		switch (s->ptr[offset]) {
+		case '\r':
 		case '\n':
 			return true;
 		case ' ':
 			offset++;
 			break;
+		case '\t':
+			offset++;
+			break;
 		default:
 			return false;
 		}
@@ -116,7 +134,7 @@ static inline bool accepts_lines(cmark_node_type block_type)
 	        block_type == NODE_CODE_BLOCK);
 }
 
-static void add_line(cmark_node* node, cmark_chunk *ch, int offset)
+static void add_line(cmark_node* node, cmark_chunk *ch, bufsize_t offset)
 {
 	assert(node->open);
 	cmark_strbuf_put(&node->string_content, ch->data + offset, ch->len - offset);
@@ -124,12 +142,13 @@ static void add_line(cmark_node* node, cmark_chunk *ch, int offset)
 
 static void remove_trailing_blank_lines(cmark_strbuf *ln)
 {
-	int i;
+	bufsize_t i;
+	unsigned char c;
 
 	for (i = ln->size - 1; i >= 0; --i) {
-		unsigned char c = ln->ptr[i];
+		c = ln->ptr[i];
 
-		if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
+		if (c != ' ' && c != '\t' && !S_is_line_end_char(c))
 			break;
 	}
 
@@ -138,12 +157,19 @@ static void remove_trailing_blank_lines(cmark_strbuf *ln)
 		return;
 	}
 
-	i = cmark_strbuf_strchr(ln, '\n', i);
-	if (i >= 0)
+
+	for(; i < ln->size; ++i) {
+		c = ln->ptr[i];
+
+		if (!S_is_line_end_char(c))
+			continue;
+
 		cmark_strbuf_truncate(ln, i);
+		break;
+	}
 }
 
-// Check to see if a cmark_node ends with a blank line, descending
+// Check to see if a node ends with a blank line, descending
 // if needed into lists and sublists.
 static bool ends_with_blank_line(cmark_node* node)
 {
@@ -184,18 +210,14 @@ static int break_out_of_lists(cmark_parser *parser, cmark_node ** bptr)
 static cmark_node*
 finalize(cmark_parser *parser, cmark_node* b)
 {
-	int firstlinelen;
-	int pos;
+	bufsize_t pos;
 	cmark_node* item;
 	cmark_node* subitem;
 	cmark_node* parent;
 
 	parent = b->parent;
 
-	// don't do anything if the cmark_node is already closed
-	if (!b->open)
-		return parent;
-
+	assert(b->open);  // shouldn't call finalize on closed blocks
 	b->open = false;
 
 	if (parser->curline->size == 0) {
@@ -206,9 +228,11 @@ finalize(cmark_parser *parser, cmark_node* b)
 	           (b->type == NODE_CODE_BLOCK && b->as.code.fenced) ||
 	           (b->type == NODE_HEADER && b->as.header.setext)) {
 		b->end_line = parser->line_number;
-		b->end_column = parser->curline->size -
-		                (parser->curline->ptr[parser->curline->size - 1] == '\n' ?
-		                 1 : 0);
+		b->end_column = parser->curline->size;
+		if (b->end_column && parser->curline->ptr[b->end_column - 1] == '\n')
+			b->end_column -= 1;
+		if (b->end_column && parser->curline->ptr[b->end_column - 1] == '\r')
+			b->end_column -= 1;
 	} else {
 		b->end_line = parser->line_number - 1;
 		b->end_column = parser->last_line_length;
@@ -234,19 +258,27 @@ finalize(cmark_parser *parser, cmark_node* b)
 		} else {
 
 			// first line of contents becomes info
-			firstlinelen = cmark_strbuf_strchr(&b->string_content, '\n', 0);
+			for (pos = 0; pos < b->string_content.size; ++pos) {
+				if (S_is_line_end_char(b->string_content.ptr[pos]))
+					break;
+			}
+			assert(pos < b->string_content.size);
 
 			cmark_strbuf tmp = GH_BUF_INIT;
 			houdini_unescape_html_f(
 			    &tmp,
 			    b->string_content.ptr,
-			    firstlinelen
+			    pos
 			);
 			cmark_strbuf_trim(&tmp);
 			cmark_strbuf_unescape(&tmp);
 			b->as.code.info = cmark_chunk_buf_detach(&tmp);
 
-			cmark_strbuf_drop(&b->string_content, firstlinelen + 1);
+			if (b->string_content.ptr[pos] == '\r')
+				pos += 1;
+			if (b->string_content.ptr[pos] == '\n')
+				pos += 1;
+			cmark_strbuf_drop(&b->string_content, pos);
 		}
 		b->as.code.literal = cmark_chunk_buf_detach(&b->string_content);
 		break;
@@ -290,14 +322,14 @@ finalize(cmark_parser *parser, cmark_node* b)
 	return parent;
 }
 
-// Add a cmark_node as child of another.  Return pointer to child.
+// Add a node as child of another.  Return pointer to child.
 static cmark_node* add_child(cmark_parser *parser, cmark_node* parent,
                              cmark_node_type block_type, int start_column)
 {
 	assert(parent);
 
-	// if 'parent' isn't the kind of cmark_node that can accept this child,
-	// then back up til we hit a cmark_node that can.
+	// if 'parent' isn't the kind of node that can accept this child,
+	// then back up til we hit a node that can.
 	while (!can_contain(parent->type, block_type)) {
 		parent = finalize(parser, parent);
 	}
@@ -317,9 +349,9 @@ static cmark_node* add_child(cmark_parser *parser, cmark_node* parent,
 }
 
 
-// Walk through cmark_node and all children, recursively, parsing
+// Walk through node and all children, recursively, parsing
 // string content into inline content where appropriate.
-static void process_inlines(cmark_node* root, cmark_reference_map *refmap)
+static void process_inlines(cmark_node* root, cmark_reference_map *refmap, int options)
 {
 	cmark_iter *iter = cmark_iter_new(root);
 	cmark_node *cur;
@@ -330,7 +362,7 @@ static void process_inlines(cmark_node* root, cmark_reference_map *refmap)
 		if (ev_type == CMARK_EVENT_ENTER) {
 			if (cur->type == NODE_PARAGRAPH ||
 			    cur->type == NODE_HEADER) {
-				cmark_parse_inlines(cur, refmap);
+				cmark_parse_inlines(cur, refmap, options);
 			}
 		}
 	}
@@ -341,16 +373,16 @@ static void process_inlines(cmark_node* root, cmark_reference_map *refmap)
 // Attempts to parse a list item marker (bullet or enumerated).
 // On success, returns length of the marker, and populates
 // data with the details.  On failure, returns 0.
-static int parse_list_marker(cmark_chunk *input, int pos, cmark_list **dataptr)
+static bufsize_t parse_list_marker(cmark_chunk *input, bufsize_t pos, cmark_list **dataptr)
 {
 	unsigned char c;
-	int startpos;
+	bufsize_t startpos;
 	cmark_list *data;
 
 	startpos = pos;
 	c = peek_at(input, pos);
 
-	if ((c == '*' || c == '-' || c == '+') && !scan_hrule(input, pos)) {
+	if (c == '*' || c == '-' || c == '+') {
 		pos++;
 		if (!cmark_isspace(peek_at(input, pos))) {
 			return 0;
@@ -368,11 +400,16 @@ static int parse_list_marker(cmark_chunk *input, int pos, cmark_list **dataptr)
 		}
 	} else if (cmark_isdigit(c)) {
 		int start = 0;
+		int digits = 0;
 
 		do {
 			start = (10 * start) + (peek_at(input, pos) - '0');
 			pos++;
-		} while (cmark_isdigit(peek_at(input, pos)));
+			digits++;
+			// We limit to 9 digits to avoid overflow,
+			// assuming max int is 2^31 - 1.
+			// This also seems to be the limit for 'start' in some browsers.
+		} while (digits < 9 && cmark_isdigit(peek_at(input, pos)));
 
 		c = peek_at(input, pos);
 		if (c == '.' || c == ')') {
@@ -419,15 +456,15 @@ static cmark_node *finalize_document(cmark_parser *parser)
 	}
 
 	finalize(parser, parser->root);
-	process_inlines(parser->root, parser->refmap);
+	process_inlines(parser->root, parser->refmap, parser->options);
 
 	return parser->root;
 }
 
-cmark_node *cmark_parse_file(FILE *f)
+cmark_node *cmark_parse_file(FILE *f, int options)
 {
 	unsigned char buffer[4096];
-	cmark_parser *parser = cmark_parser_new();
+	cmark_parser *parser = cmark_parser_new(options);
 	size_t bytes;
 	cmark_node *document;
 
@@ -444,9 +481,9 @@ cmark_node *cmark_parse_file(FILE *f)
 	return document;
 }
 
-cmark_node *cmark_parse_document(const char *buffer, size_t len)
+cmark_node *cmark_parse_document(const char *buffer, size_t len, int options)
 {
-	cmark_parser *parser = cmark_parser_new();
+	cmark_parser *parser = cmark_parser_new(options);
 	cmark_node *document;
 
 	S_parser_feed(parser, (const unsigned char *)buffer, len, true);
@@ -467,38 +504,58 @@ S_parser_feed(cmark_parser *parser, const unsigned char *buffer, size_t len,
               bool eof)
 {
 	const unsigned char *end = buffer + len;
+	static const uint8_t repl[] = {239, 191, 189};
 
 	while (buffer < end) {
-		const unsigned char *eol
-		    = (const unsigned char *)memchr(buffer, '\n',
-		                                    end - buffer);
-		size_t line_len;
-
-		if (eol) {
-			line_len = eol + 1 - buffer;
-		} else if (eof) {
-			line_len = end - buffer;
-		} else {
-			cmark_strbuf_put(parser->linebuf, buffer, end - buffer);
-			break;
+		const unsigned char *eol;
+		bufsize_t chunk_len;
+		bool process = false;
+		for (eol = buffer; eol < end; ++eol) {
+			if (S_is_line_end_char(*eol)) {
+				if (eol < end && *eol == '\r')
+					eol++;
+				if (eol < end && *eol == '\n')
+					eol++;
+				process = true;
+				break;
+			}
+			if (*eol == '\0' && eol < end) {
+				break;
+			}
+		}
+		if (eol >= end && eof) {
+			process = true;
 		}
 
-		if (parser->linebuf->size > 0) {
-			cmark_strbuf_put(parser->linebuf, buffer, line_len);
-			S_process_line(parser, parser->linebuf->ptr,
-			               parser->linebuf->size);
-			cmark_strbuf_clear(parser->linebuf);
+		chunk_len = cmark_strbuf_check_bufsize(eol - buffer);
+		if (process) {
+			if (parser->linebuf->size > 0) {
+				cmark_strbuf_put(parser->linebuf, buffer, chunk_len);
+				S_process_line(parser, parser->linebuf->ptr,
+				               parser->linebuf->size);
+				cmark_strbuf_clear(parser->linebuf);
+			} else {
+				S_process_line(parser, buffer, chunk_len);
+			}
 		} else {
-			S_process_line(parser, buffer, line_len);
+			if (eol < end && *eol == '\0') {
+				// omit NULL byte
+				cmark_strbuf_put(parser->linebuf, buffer, chunk_len);
+				// add replacement character
+				cmark_strbuf_put(parser->linebuf, repl, 3);
+				chunk_len += 1; // so we advance the buffer past NULL
+			} else {
+				cmark_strbuf_put(parser->linebuf, buffer, chunk_len);
+			}
 		}
 
-		buffer += line_len;
+		buffer += chunk_len;
 	}
 }
 
 static void chop_trailing_hashtags(cmark_chunk *ch)
 {
-	int n, orig_n;
+	bufsize_t n, orig_n;
 
 	cmark_chunk_rtrim(ch);
 	orig_n = n = ch->len - 1;
@@ -515,29 +572,77 @@ static void chop_trailing_hashtags(cmark_chunk *ch)
 }
 
 static void
-S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
+S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input)
+{
+	char c;
+	int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
+
+	parser->first_nonspace = parser->offset;
+	parser->first_nonspace_column = parser->column;
+	while ((c = peek_at(input, parser->first_nonspace))) {
+		if (c == ' ') {
+			parser->first_nonspace += 1;
+			parser->first_nonspace_column += 1;
+			chars_to_tab = chars_to_tab - 1;
+			if (chars_to_tab == 0) {
+				chars_to_tab = TAB_STOP;
+			}
+		} else if (c == '\t') {
+			parser->first_nonspace += 1;
+			parser->first_nonspace_column += chars_to_tab;
+			chars_to_tab = TAB_STOP;
+		} else {
+			break;
+		}
+	}
+
+	parser->indent = parser->first_nonspace_column - parser->column;
+	parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
+}
+
+static void
+S_advance_offset(cmark_parser *parser, cmark_chunk *input, bufsize_t count, bool columns)
+{
+	char c;
+	int chars_to_tab;
+	while (count > 0 && (c = peek_at(input, parser->offset))) {
+		if (c == '\t') {
+			chars_to_tab = 4 - (parser->column % TAB_STOP);
+			parser->column += chars_to_tab;
+			parser->offset += 1;
+			count -= (columns ? chars_to_tab : 1);
+		} else {
+			parser->offset += 1;
+			parser->column += 1; // assume ascii; block starts are ascii
+			count -= 1;
+		}
+	}
+}
+
+
+static void
+S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t bytes)
 {
 	cmark_node* last_matched_container;
-	int offset = 0;
-	int matched = 0;
+	bufsize_t matched = 0;
 	int lev = 0;
 	int i;
 	cmark_list *data = NULL;
 	bool all_matched = true;
 	cmark_node* container;
-	cmark_node* cur = parser->current;
-	bool blank = false;
-	int first_nonspace;
-	int indent;
+	bool indented;
 	cmark_chunk input;
+	bool maybe_lazy;
 
-	utf8proc_detab(parser->curline, buffer, bytes);
-
-	// Add a newline to the end if not present:
-	// TODO this breaks abstraction:
-	if (parser->curline->ptr[parser->curline->size - 1] != '\n') {
-		cmark_strbuf_putc(parser->curline, '\n');
+	if (parser->options & CMARK_OPT_VALIDATE_UTF8) {
+		utf8proc_check(parser->curline, buffer, bytes);
+	} else {
+		cmark_strbuf_put(parser->curline, buffer, bytes);
 	}
+	parser->offset = 0;
+	parser->column = 0;
+	parser->blank = false;
+
 	input.data = parser->curline->ptr;
 	input.len = parser->curline->size;
 
@@ -546,38 +651,33 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 
 	parser->line_number++;
 
-	// for each containing cmark_node, try to parse the associated line start.
-	// bail out on failure:  container will point to the last matching cmark_node.
+	// for each containing node, try to parse the associated line start.
+	// bail out on failure:  container will point to the last matching node.
 
 	while (container->last_child && container->last_child->open) {
 		container = container->last_child;
 
-		first_nonspace = offset;
-		while (peek_at(&input, first_nonspace) == ' ') {
-			first_nonspace++;
-		}
-
-		indent = first_nonspace - offset;
-		blank = peek_at(&input, first_nonspace) == '\n';
+		S_find_first_nonspace(parser, &input);
 
 		if (container->type == NODE_BLOCK_QUOTE) {
-			matched = indent <= 3 && peek_at(&input, first_nonspace) == '>';
+			matched = parser->indent <= 3 && peek_at(&input, parser->first_nonspace) == '>';
 			if (matched) {
-				offset = first_nonspace + 1;
-				if (peek_at(&input, offset) == ' ')
-					offset++;
+				S_advance_offset(parser, &input, parser->indent + 1, true);
+				if (peek_at(&input, parser->offset) == ' ')
+					parser->offset++;
 			} else {
 				all_matched = false;
 			}
 
 		} else if (container->type == NODE_ITEM) {
-
-			if (indent >= container->as.list.marker_offset +
+			if (parser->indent >= container->as.list.marker_offset +
 			    container->as.list.padding) {
-				offset += container->as.list.marker_offset +
-				          container->as.list.padding;
-			} else if (blank) {
-				offset = first_nonspace;
+				S_advance_offset(parser, &input,
+				                 container->as.list.marker_offset +
+				                 container->as.list.padding, true);
+			} else if (parser->blank) {
+				S_advance_offset(parser, &input,
+				                 parser->first_nonspace - parser->offset, false);
 			} else {
 				all_matched = false;
 			}
@@ -585,34 +685,36 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 		} else if (container->type == NODE_CODE_BLOCK) {
 
 			if (!container->as.code.fenced) { // indented
-				if (indent >= CODE_INDENT) {
-					offset += CODE_INDENT;
-				} else if (blank) {
-					offset = first_nonspace;
+				if (parser->indent >= CODE_INDENT) {
+					S_advance_offset(parser, &input, CODE_INDENT, true);
+				} else if (parser->blank) {
+					S_advance_offset(parser, &input,
+					                 parser->first_nonspace - parser->offset,
+					                 false);
 				} else {
 					all_matched = false;
 				}
 			} else { // fenced
 				matched = 0;
-				if (indent <= 3 &&
-					(peek_at(&input, first_nonspace) ==
-					 container->as.code.fence_char)) {
+				if (parser->indent <= 3 &&
+				    (peek_at(&input, parser->first_nonspace) ==
+				     container->as.code.fence_char)) {
 					matched = scan_close_code_fence(&input,
-							first_nonspace);
+					                                parser->first_nonspace);
 				}
 				if (matched >= container->as.code.fence_length) {
 					// closing fence - and since we're at
 					// the end of a line, we can return:
 					all_matched = false;
-					offset += matched;
-					finalize(parser, container);
+					S_advance_offset(parser, &input, matched, false);
+					parser->current = finalize(parser, container);
 					goto finished;
 				} else {
-					// skip opt. spaces of fence offset
+					// skip optional spaces, up to the fence's offset
 					i = container->as.code.fence_offset;
 					while (i > 0 &&
-					    peek_at(&input, offset) == ' ') {
-						offset++;
+					       peek_at(&input, parser->offset) == ' ') {
+						S_advance_offset(parser, &input, 1, false);
 						i--;
 					}
 				}
@@ -624,20 +726,38 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 
 		} else if (container->type == NODE_HTML) {
 
-			if (blank) {
-				all_matched = false;
+			switch (container->as.html_block_type) {
+			case 1:
+			case 2:
+			case 3:
+			case 4:
+			case 5:
+				// these types of blocks can accept blanks
+				break;
+			case 6:
+			case 7:
+				if (parser->blank) {
+					all_matched = false;
+				}
+				break;
+			default:
+				fprintf(stderr,
+				        "Error (%s:%d): Unknown HTML block type %d\n",
+				        __FILE__, __LINE__,
+				        container->as.html_block_type);
+				exit(1);
 			}
 
 		} else if (container->type == NODE_PARAGRAPH) {
 
-			if (blank) {
+			if (parser->blank) {
 				all_matched = false;
 			}
 
 		}
 
 		if (!all_matched) {
-			container = container->parent;  // back up to last matching cmark_node
+			container = container->parent;  // back up to last matching node
 			break;
 		}
 	}
@@ -645,48 +765,33 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 	last_matched_container = container;
 
 	// check to see if we've hit 2nd blank line, break out of list:
-	if (blank && container->last_line_blank) {
+	if (parser->blank && container->last_line_blank) {
 		break_out_of_lists(parser, &container);
 	}
 
-	// unless last matched container is code cmark_node, try new container starts:
+	maybe_lazy = parser->current->type == NODE_PARAGRAPH;
+	// try new container starts:
 	while (container->type != NODE_CODE_BLOCK &&
 	       container->type != NODE_HTML) {
 
-		first_nonspace = offset;
-		while (peek_at(&input, first_nonspace) == ' ')
-			first_nonspace++;
-
-		indent = first_nonspace - offset;
-		blank = peek_at(&input, first_nonspace) == '\n';
-
-		if (indent >= CODE_INDENT) {
-			if (cur->type != NODE_PARAGRAPH && !blank) {
-				offset += CODE_INDENT;
-				container = add_child(parser, container, NODE_CODE_BLOCK, offset + 1);
-				container->as.code.fenced = false;
-				container->as.code.fence_char = 0;
-				container->as.code.fence_length = 0;
-				container->as.code.fence_offset = 0;
-				container->as.code.info = cmark_chunk_literal("");
-			} else { // indent > 4 in lazy line
-				break;
-			}
+		S_find_first_nonspace(parser, &input);
+		indented = parser->indent >= CODE_INDENT;
 
-		} else if (peek_at(&input, first_nonspace) == '>') {
+		if (!indented && peek_at(&input, parser->first_nonspace) == '>') {
 
-			offset = first_nonspace + 1;
+			S_advance_offset(parser, &input, parser->first_nonspace + 1 - parser->offset, false);
 			// optional following character
-			if (peek_at(&input, offset) == ' ')
-				offset++;
-			container = add_child(parser, container, NODE_BLOCK_QUOTE, offset + 1);
+			if (peek_at(&input, parser->offset) == ' ')
+				S_advance_offset(parser, &input, 1, false);
+			container = add_child(parser, container, NODE_BLOCK_QUOTE, parser->offset + 1);
 
-		} else if ((matched = scan_atx_header_start(&input, first_nonspace))) {
+		} else if (!indented && (matched = scan_atx_header_start(&input, parser->first_nonspace))) {
 
-			offset = first_nonspace + matched;
-			container = add_child(parser, container, NODE_HEADER, offset + 1);
+			S_advance_offset(parser, &input,
+			                 parser->first_nonspace + matched - parser->offset, false);
+			container = add_child(parser, container, NODE_HEADER, parser->offset + 1);
 
-			int hashpos = cmark_chunk_strchr(&input, '#', first_nonspace);
+			bufsize_t hashpos = cmark_chunk_strchr(&input, '#', parser->first_nonspace);
 			int level = 0;
 
 			while (peek_at(&input, hashpos) == '#') {
@@ -696,78 +801,99 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 			container->as.header.level = level;
 			container->as.header.setext = false;
 
-		} else if ((matched = scan_open_code_fence(&input, first_nonspace))) {
+		} else if (!indented && (matched = scan_open_code_fence(&input, parser->first_nonspace))) {
 
-			container = add_child(parser, container, NODE_CODE_BLOCK, first_nonspace + 1);
+			container = add_child(parser, container, NODE_CODE_BLOCK, parser->first_nonspace + 1);
 			container->as.code.fenced = true;
-			container->as.code.fence_char = peek_at(&input, first_nonspace);
+			container->as.code.fence_char = peek_at(&input, parser->first_nonspace);
 			container->as.code.fence_length = matched;
-			container->as.code.fence_offset = first_nonspace - offset;
+			container->as.code.fence_offset = parser->first_nonspace - parser->offset;
 			container->as.code.info = cmark_chunk_literal("");
-			offset = first_nonspace + matched;
+			S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false);
 
-		} else if ((matched = scan_html_block_tag(&input, first_nonspace))) {
+		} else if (!indented &&
+		           ((matched = scan_html_block_start(&input, parser->first_nonspace)) ||
+		            (container->type != NODE_PARAGRAPH &&
+		             (matched = scan_html_block_start_7(&input, parser->first_nonspace))))) {
 
-			container = add_child(parser, container, NODE_HTML, first_nonspace + 1);
-			// note, we don't adjust offset because the tag is part of the text
+			container = add_child(parser, container, NODE_HTML, parser->first_nonspace + 1);
+			container->as.html_block_type = matched;
+			// note, we don't adjust parser->offset because the tag is part of the text
 
-		} else if (container->type == NODE_PARAGRAPH &&
-		           (lev = scan_setext_header_line(&input, first_nonspace)) &&
+		} else if (!indented &&
+		           container->type == NODE_PARAGRAPH &&
+		           (lev = scan_setext_header_line(&input, parser->first_nonspace)) &&
 		           // check that there is only one line in the paragraph:
-		           cmark_strbuf_strrchr(&container->string_content, '\n',
-		                                cmark_strbuf_len(&container->string_content) - 2) < 0) {
+		           (cmark_strbuf_strrchr(&container->string_content, '\n',
+		                                 cmark_strbuf_len(&container->string_content) - 2) < 0)) {
 
 			container->type = NODE_HEADER;
 			container->as.header.level = lev;
 			container->as.header.setext = true;
-			offset = input.len - 1;
+			S_advance_offset(parser, &input, input.len - 1 - parser->offset, false);
 
-		} else if (!(container->type == NODE_PARAGRAPH && !all_matched) &&
-		           (matched = scan_hrule(&input, first_nonspace))) {
+		} else if (!indented &&
+		           !(container->type == NODE_PARAGRAPH &&
+		             !all_matched) &&
+		           (matched = scan_hrule(&input, parser->first_nonspace))) {
 
 			// it's only now that we know the line is not part of a setext header:
-			container = add_child(parser, container, NODE_HRULE, first_nonspace + 1);
+			container = add_child(parser, container, NODE_HRULE, parser->first_nonspace + 1);
 			container = finalize(parser, container);
-			offset = input.len - 1;
+			S_advance_offset(parser, &input, input.len - 1 - parser->offset, false);
 
-		} else if ((matched = parse_list_marker(&input, first_nonspace, &data))) {
+		} else if ((matched = parse_list_marker(&input, parser->first_nonspace, &data)) &&
+		           (!indented || container->type == NODE_LIST)) {
+			// Note that we can have new list items starting with >= 4
+			// spaces indent, as long as the list container is still open.
 
 			// compute padding:
-			offset = first_nonspace + matched;
+			S_advance_offset(parser, &input, parser->first_nonspace + matched - parser->offset, false);
 			i = 0;
-			while (i <= 5 && peek_at(&input, offset + i) == ' ') {
+			while (i <= 5 && peek_at(&input, parser->offset + i) == ' ') {
 				i++;
 			}
 			// i = number of spaces after marker, up to 5
-			if (i >= 5 || i < 1 || peek_at(&input, offset) == '\n') {
+			if (i >= 5 || i < 1 ||
+			    S_is_line_end_char(peek_at(&input, parser->offset))) {
 				data->padding = matched + 1;
 				if (i > 0) {
-					offset += 1;
+					S_advance_offset(parser, &input, 1, false);
 				}
 			} else {
 				data->padding = matched + i;
-				offset += i;
+				S_advance_offset(parser, &input, i, true);
 			}
 
 			// check container; if it's a list, see if this list item
 			// can continue the list; otherwise, create a list container.
 
-			data->marker_offset = indent;
+			data->marker_offset = parser->indent;
 
 			if (container->type != NODE_LIST ||
 			    !lists_match(&container->as.list, data)) {
 				container = add_child(parser, container, NODE_LIST,
-				                      first_nonspace + 1);
+				                      parser->first_nonspace + 1);
 
 				memcpy(&container->as.list, data, sizeof(*data));
 			}
 
 			// add the list item
 			container = add_child(parser, container, NODE_ITEM,
-			                      first_nonspace + 1);
+			                      parser->first_nonspace + 1);
 			/* TODO: static */
 			memcpy(&container->as.list, data, sizeof(*data));
 			free(data);
+
+		} else if (indented && !maybe_lazy && !parser->blank) {
+			S_advance_offset(parser, &input, CODE_INDENT, true);
+			container = add_child(parser, container, NODE_CODE_BLOCK, parser->offset + 1);
+			container->as.code.fenced = false;
+			container->as.code.fence_char = 0;
+			container->as.code.fence_length = 0;
+			container->as.code.fence_offset = 0;
+			container->as.code.info = cmark_chunk_literal("");
+
 		} else {
 			break;
 		}
@@ -776,19 +902,15 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 			// if it's a line container, it can't contain other containers
 			break;
 		}
+		maybe_lazy = false;
 	}
 
-	// what remains at offset is a text line.  add the text to the
+	// what remains at parser->offset is a text line.  add the text to the
 	// appropriate container.
 
-	first_nonspace = offset;
-	while (peek_at(&input, first_nonspace) == ' ')
-		first_nonspace++;
-
-	indent = first_nonspace - offset;
-	blank = peek_at(&input, first_nonspace) == '\n';
+	S_find_first_nonspace(parser, &input);
 
-	if (blank && container->last_child) {
+	if (parser->blank && container->last_child) {
 		container->last_child->last_line_blank = true;
 	}
 
@@ -796,7 +918,7 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 	// and we don't count blanks in fenced code for purposes of tight/loose
 	// lists or breaking out of lists.  we also don't set last_line_blank
 	// on an empty list item.
-	container->last_line_blank = (blank &&
+	container->last_line_blank = (parser->blank &&
 	                              container->type != NODE_BLOCK_QUOTE &&
 	                              container->type != NODE_HEADER &&
 	                              !(container->type == NODE_CODE_BLOCK &&
@@ -811,28 +933,68 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 		cont = cont->parent;
 	}
 
-	if (cur != last_matched_container &&
+	if (parser->current != last_matched_container &&
 	    container == last_matched_container &&
-	    !blank &&
-	    cur->type == NODE_PARAGRAPH &&
-	    cmark_strbuf_len(&cur->string_content) > 0) {
+	    !parser->blank &&
+	    parser->current->type == NODE_PARAGRAPH &&
+	    cmark_strbuf_len(&parser->current->string_content) > 0) {
 
-		add_line(cur, &input, offset);
+		add_line(parser->current, &input, parser->offset);
 
 	} else { // not a lazy continuation
 
 		// finalize any blocks that were not matched and set cur to container:
-		while (cur != last_matched_container) {
-			cur = finalize(parser, cur);
-			assert(cur != NULL);
+		while (parser->current != last_matched_container) {
+			parser->current = finalize(parser, parser->current);
+			assert(parser->current != NULL);
 		}
 
-		if (container->type == NODE_CODE_BLOCK ||
-		    container->type == NODE_HTML) {
+		if (container->type == NODE_CODE_BLOCK) {
+
+			add_line(container, &input, parser->offset);
+
+		} else if (container->type == NODE_HTML) {
+
+			add_line(container, &input, parser->offset);
+
+			int matches_end_condition;
+			switch (container->as.html_block_type) {
+			case 1:
+				// </script>, </style>, </pre>
+				matches_end_condition =
+				    scan_html_block_end_1(&input, parser->first_nonspace);
+				break;
+			case 2:
+				// -->
+				matches_end_condition =
+				    scan_html_block_end_2(&input, parser->first_nonspace);
+				break;
+			case 3:
+				// ?>
+				matches_end_condition =
+				    scan_html_block_end_3(&input, parser->first_nonspace);
+				break;
+			case 4:
+				// >
+				matches_end_condition =
+				    scan_html_block_end_4(&input, parser->first_nonspace);
+				break;
+			case 5:
+				// ]]>
+				matches_end_condition =
+				    scan_html_block_end_5(&input, parser->first_nonspace);
+				break;
+			default:
+				matches_end_condition = 0;
+				break;
+			}
 
-			add_line(container, &input, offset);
+			if (matches_end_condition) {
+				container = finalize(parser, container);
+				assert(parser->current != NULL);
+			}
 
-		} else if (blank) {
+		} else if (parser->blank) {
 
 			// ??? do nothing
 
@@ -842,22 +1004,26 @@ S_process_line(cmark_parser *parser, const unsigned char *buffer, size_t bytes)
 			    container->as.header.setext == false) {
 				chop_trailing_hashtags(&input);
 			}
-			add_line(container, &input, first_nonspace);
+			add_line(container, &input, parser->first_nonspace);
 
 		} else {
 			// create paragraph container for line
-			container = add_child(parser, container, NODE_PARAGRAPH, first_nonspace + 1);
-			add_line(container, &input, first_nonspace);
+			container = add_child(parser, container, NODE_PARAGRAPH, parser->first_nonspace + 1);
+			add_line(container, &input, parser->first_nonspace);
 
 		}
 
 		parser->current = container;
 	}
 finished:
-	parser->last_line_length = parser->curline->size -
-	                           (parser->curline->ptr[parser->curline->size - 1] == '\n' ?
-	                            1 : 0);
-	;
+	parser->last_line_length = parser->curline->size;
+	if (parser->last_line_length &&
+	    parser->curline->ptr[parser->last_line_length - 1] == '\n')
+		parser->last_line_length -= 1;
+	if (parser->last_line_length &&
+	    parser->curline->ptr[parser->last_line_length - 1] == '\r')
+		parser->last_line_length -= 1;
+
 	cmark_strbuf_clear(parser->curline);
 
 }
@@ -871,7 +1037,13 @@ cmark_node *cmark_parser_finish(cmark_parser *parser)
 	}
 
 	finalize_document(parser);
+
+	if (parser->options & CMARK_OPT_NORMALIZE) {
+		cmark_consolidate_text_nodes(parser->root);
+	}
+
 	cmark_strbuf_free(parser->curline);
+
 #if CMARK_DEBUG_NODES
 	if (cmark_node_check(parser->root, stderr)) {
 		abort();

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/buffer.c
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/buffer.c b/compiler/modules/CommonMark/src/buffer.c
index 0df6561..e07fba6 100644
--- a/compiler/modules/CommonMark/src/buffer.c
+++ b/compiler/modules/CommonMark/src/buffer.c
@@ -4,6 +4,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 
 #include "config.h"
 #include "cmark_ctype.h"
@@ -13,83 +14,90 @@
  * assume ptr is non-NULL and zero terminated even for new cmark_strbufs.
  */
 unsigned char cmark_strbuf__initbuf[1];
-unsigned char cmark_strbuf__oom[1];
-
-#define ENSURE_SIZE(b, d)					\
-	if ((d) > buf->asize && cmark_strbuf_grow(b, (d)) < 0)	\
-		return -1;
 
 #ifndef MIN
 #define MIN(x,y)  ((x<y) ? x : y)
 #endif
 
-void cmark_strbuf_init(cmark_strbuf *buf, int initial_size)
+void cmark_strbuf_init(cmark_strbuf *buf, bufsize_t initial_size)
 {
 	buf->asize = 0;
 	buf->size = 0;
 	buf->ptr = cmark_strbuf__initbuf;
 
-	if (initial_size)
+	if (initial_size > 0)
 		cmark_strbuf_grow(buf, initial_size);
 }
 
-int cmark_strbuf_try_grow(cmark_strbuf *buf, int target_size, bool mark_oom)
+void cmark_strbuf_overflow_err()
 {
-	unsigned char *new_ptr;
-	int new_size;
+	fprintf(stderr, "String buffer overflow");
+	abort();
+}
 
-	if (buf->ptr == cmark_strbuf__oom)
-		return -1;
+static inline void
+S_strbuf_grow_by(cmark_strbuf *buf, size_t add)
+{
+	size_t target_size = (size_t)buf->size + add;
 
-	if (target_size <= buf->asize)
-		return 0;
+	if (target_size < add             /* Integer overflow. */
+	    || target_size > BUFSIZE_MAX  /* Truncation overflow. */
+	   ) {
+		cmark_strbuf_overflow_err();
+		return; /* unreachable */
+	}
+
+	if ((bufsize_t)target_size >= buf->asize)
+		cmark_strbuf_grow(buf, (bufsize_t)target_size);
+}
+
+void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size)
+{
+	unsigned char *new_ptr;
+
+	if (target_size < buf->asize)
+		return;
 
 	if (buf->asize == 0) {
-		new_size = target_size;
 		new_ptr = NULL;
 	} else {
-		new_size = buf->asize;
 		new_ptr = buf->ptr;
 	}
 
-	/* grow the buffer size by 1.5, until it's big enough
-	 * to fit our target size */
-	while (new_size < target_size)
-		new_size = (new_size << 1) - (new_size >> 1);
+	/* Oversize the buffer by 50% to guarantee amortized linear time
+	 * complexity on append operations. */
+	size_t new_size = (size_t)target_size + (size_t)target_size / 2;
+
+	/* Account for terminating null byte. */
+	new_size += 1;
 
 	/* round allocation up to multiple of 8 */
 	new_size = (new_size + 7) & ~7;
 
+	if (new_size < (size_t)target_size  /* Integer overflow. */
+	    || new_size > BUFSIZE_MAX       /* Truncation overflow. */
+	   ) {
+		if (target_size >= BUFSIZE_MAX) {
+			/* No space for terminating null byte. */
+			cmark_strbuf_overflow_err();
+			return; /* unreachable */
+		}
+		/* Oversize by the maximum possible amount. */
+		new_size = BUFSIZE_MAX;
+	}
+
 	new_ptr = (unsigned char *)realloc(new_ptr, new_size);
 
 	if (!new_ptr) {
-		if (mark_oom)
-			buf->ptr = cmark_strbuf__oom;
-		return -1;
+		perror("realloc in cmark_strbuf_grow");
+		abort();
 	}
 
-	buf->asize = new_size;
+	buf->asize = (bufsize_t)new_size;
 	buf->ptr   = new_ptr;
-
-	/* truncate the existing buffer size if necessary */
-	if (buf->size >= buf->asize)
-		buf->size = buf->asize - 1;
-	buf->ptr[buf->size] = '\0';
-
-	return 0;
-}
-
-int cmark_strbuf_grow(cmark_strbuf *buf, int target_size)
-{
-	return cmark_strbuf_try_grow(buf, target_size, true);
-}
-
-bool cmark_strbuf_oom(const cmark_strbuf *buf)
-{
-	return (buf->ptr == cmark_strbuf__oom);
 }
 
-size_t cmark_strbuf_len(const cmark_strbuf *buf)
+bufsize_t cmark_strbuf_len(const cmark_strbuf *buf)
 {
 	return buf->size;
 }
@@ -98,7 +106,7 @@ void cmark_strbuf_free(cmark_strbuf *buf)
 {
 	if (!buf) return;
 
-	if (buf->ptr != cmark_strbuf__initbuf && buf->ptr != cmark_strbuf__oom)
+	if (buf->ptr != cmark_strbuf__initbuf)
 		free(buf->ptr);
 
 	cmark_strbuf_init(buf, 0);
@@ -112,106 +120,106 @@ void cmark_strbuf_clear(cmark_strbuf *buf)
 		buf->ptr[0] = '\0';
 }
 
-int cmark_strbuf_set(cmark_strbuf *buf, const unsigned char *data, int len)
+void cmark_strbuf_set(cmark_strbuf *buf, const unsigned char *data, bufsize_t len)
 {
 	if (len <= 0 || data == NULL) {
 		cmark_strbuf_clear(buf);
 	} else {
 		if (data != buf->ptr) {
-			ENSURE_SIZE(buf, len + 1);
+			if (len >= buf->asize)
+				cmark_strbuf_grow(buf, len);
 			memmove(buf->ptr, data, len);
 		}
 		buf->size = len;
 		buf->ptr[buf->size] = '\0';
 	}
-	return 0;
 }
 
-int cmark_strbuf_sets(cmark_strbuf *buf, const char *string)
+void cmark_strbuf_sets(cmark_strbuf *buf, const char *string)
 {
-	return cmark_strbuf_set(buf,
-	                        (const unsigned char *)string,
-	                        string ? strlen(string) : 0);
+	cmark_strbuf_set(buf, (const unsigned char *)string,
+	                 string ? cmark_strbuf_safe_strlen(string) : 0);
 }
 
-int cmark_strbuf_putc(cmark_strbuf *buf, int c)
+void cmark_strbuf_putc(cmark_strbuf *buf, int c)
 {
-	ENSURE_SIZE(buf, buf->size + 2);
+	S_strbuf_grow_by(buf, 1);
 	buf->ptr[buf->size++] = c;
 	buf->ptr[buf->size] = '\0';
-	return 0;
 }
 
-int cmark_strbuf_put(cmark_strbuf *buf, const unsigned char *data, int len)
+void cmark_strbuf_put(cmark_strbuf *buf, const unsigned char *data, bufsize_t len)
 {
 	if (len <= 0)
-		return 0;
+		return;
 
-	ENSURE_SIZE(buf, buf->size + len + 1);
+	S_strbuf_grow_by(buf, len);
 	memmove(buf->ptr + buf->size, data, len);
 	buf->size += len;
 	buf->ptr[buf->size] = '\0';
-	return 0;
 }
 
-int cmark_strbuf_puts(cmark_strbuf *buf, const char *string)
+void cmark_strbuf_puts(cmark_strbuf *buf, const char *string)
 {
-	return cmark_strbuf_put(buf, (const unsigned char *)string, strlen(string));
+	cmark_strbuf_put(buf, (const unsigned char *)string,
+	                 cmark_strbuf_safe_strlen(string));
 }
 
-int cmark_strbuf_vprintf(cmark_strbuf *buf, const char *format, va_list ap)
+void cmark_strbuf_vprintf(cmark_strbuf *buf, const char *format, va_list ap)
 {
-	const int expected_size = buf->size + (strlen(format) * 2);
-	int len;
-
-	ENSURE_SIZE(buf, expected_size);
+	size_t expected_size = strlen(format);
+	if (expected_size <= SIZE_MAX / 2)
+		expected_size *= 2;
+	S_strbuf_grow_by(buf, expected_size);
 
 	while (1) {
 		va_list args;
 		va_copy(args, ap);
 
-		len = vsnprintf(
-		          (char *)buf->ptr + buf->size,
-		          buf->asize - buf->size,
-		          format, args
-		      );
+		int len = vsnprintf(
+		              (char *)buf->ptr + buf->size,
+		              buf->asize - buf->size,
+		              format, args
+		          );
+#ifndef HAVE_C99_SNPRINTF
+		// Assume we're on Windows.
+		if (len < 0) {
+			len = _vscprintf(format, args);
+		}
+#endif
 
 		va_end(args);
 
 		if (len < 0) {
-			free(buf->ptr);
-			buf->ptr = cmark_strbuf__oom;
-			return -1;
+			perror("vsnprintf in cmark_strbuf_vprintf");
+			abort();
 		}
 
-		if (len + 1 <= buf->asize - buf->size) {
+		if ((size_t)len < (size_t)(buf->asize - buf->size)) {
 			buf->size += len;
 			break;
 		}
 
-		ENSURE_SIZE(buf, buf->size + len + 1);
+		S_strbuf_grow_by(buf, len);
 	}
-
-	return 0;
 }
 
-int cmark_strbuf_printf(cmark_strbuf *buf, const char *format, ...)
+void cmark_strbuf_printf(cmark_strbuf *buf, const char *format, ...)
 {
-	int r;
 	va_list ap;
 
 	va_start(ap, format);
-	r = cmark_strbuf_vprintf(buf, format, ap);
+	cmark_strbuf_vprintf(buf, format, ap);
 	va_end(ap);
-
-	return r;
 }
 
-void cmark_strbuf_copy_cstr(char *data, int datasize, const cmark_strbuf *buf)
+void cmark_strbuf_copy_cstr(char *data, bufsize_t datasize, const cmark_strbuf *buf)
 {
-	int copylen;
+	bufsize_t copylen;
 
-	assert(data && datasize && buf);
+	assert(buf);
+	if (!data || datasize <= 0)
+		return;
 
 	data[0] = '\0';
 
@@ -236,7 +244,7 @@ unsigned char *cmark_strbuf_detach(cmark_strbuf *buf)
 {
 	unsigned char *data = buf->ptr;
 
-	if (buf->asize == 0 || buf->ptr == cmark_strbuf__oom) {
+	if (buf->asize == 0) {
 		/* return an empty string */
 		return (unsigned char *)calloc(1, 1);
 	}
@@ -245,22 +253,6 @@ unsigned char *cmark_strbuf_detach(cmark_strbuf *buf)
 	return data;
 }
 
-void cmark_strbuf_attach(cmark_strbuf *buf, unsigned char *ptr, int asize)
-{
-	cmark_strbuf_free(buf);
-
-	if (ptr) {
-		buf->ptr = ptr;
-		buf->size = strlen((char *)ptr);
-		if (asize)
-			buf->asize = (asize < buf->size) ? buf->size + 1 : asize;
-		else /* pass 0 to fall back on strlen + 1 */
-			buf->asize = buf->size + 1;
-	} else {
-		cmark_strbuf_grow(buf, asize);
-	}
-}
-
 int cmark_strbuf_cmp(const cmark_strbuf *a, const cmark_strbuf *b)
 {
 	int result = memcmp(a->ptr, b->ptr, MIN(a->size, b->size));
@@ -268,20 +260,28 @@ int cmark_strbuf_cmp(const cmark_strbuf *a, const cmark_strbuf *b)
 	       (a->size < b->size) ? -1 : (a->size > b->size) ? 1 : 0;
 }
 
-int cmark_strbuf_strchr(const cmark_strbuf *buf, int c, int pos)
+bufsize_t cmark_strbuf_strchr(const cmark_strbuf *buf, int c, bufsize_t pos)
 {
+	if (pos >= buf->size)
+		return -1;
+	if (pos < 0)
+		pos = 0;
+
 	const unsigned char *p = (unsigned char *)memchr(buf->ptr + pos, c, buf->size - pos);
 	if (!p)
 		return -1;
 
-	return (int)(p - (const unsigned char *)buf->ptr);
+	return (bufsize_t)(p - (const unsigned char *)buf->ptr);
 }
 
-int cmark_strbuf_strrchr(const cmark_strbuf *buf, int c, int pos)
+bufsize_t cmark_strbuf_strrchr(const cmark_strbuf *buf, int c, bufsize_t pos)
 {
-	int i;
+	if (pos < 0 || buf->size == 0)
+		return -1;
+	if (pos >= buf->size)
+		pos = buf->size - 1;
 
-	for (i = pos; i >= 0; i--) {
+	for (bufsize_t i = pos; i >= 0; i--) {
 		if (buf->ptr[i] == (unsigned char) c)
 			return i;
 	}
@@ -289,17 +289,22 @@ int cmark_strbuf_strrchr(const cmark_strbuf *buf, int c, int pos)
 	return -1;
 }
 
-void cmark_strbuf_truncate(cmark_strbuf *buf, int len)
+void cmark_strbuf_truncate(cmark_strbuf *buf, bufsize_t len)
 {
+	if (len < 0)
+		len = 0;
+
 	if (len < buf->size) {
 		buf->size = len;
 		buf->ptr[buf->size] = '\0';
 	}
 }
 
-void cmark_strbuf_drop(cmark_strbuf *buf, int n)
+void cmark_strbuf_drop(cmark_strbuf *buf, bufsize_t n)
 {
 	if (n > 0) {
+		if (n > buf->size)
+			n = buf->size;
 		buf->size = buf->size - n;
 		if (buf->size)
 			memmove(buf->ptr, buf->ptr + n, buf->size);
@@ -325,7 +330,7 @@ void cmark_strbuf_rtrim(cmark_strbuf *buf)
 
 void cmark_strbuf_trim(cmark_strbuf *buf)
 {
-	int i = 0;
+	bufsize_t i = 0;
 
 	if (!buf->size)
 		return;
@@ -343,7 +348,7 @@ void cmark_strbuf_trim(cmark_strbuf *buf)
 void cmark_strbuf_normalize_whitespace(cmark_strbuf *s)
 {
 	bool last_char_was_space = false;
-	int r, w;
+	bufsize_t r, w;
 
 	for (r = 0, w = 0; r < s->size; ++r) {
 		switch (s->ptr[r]) {
@@ -368,11 +373,11 @@ void cmark_strbuf_normalize_whitespace(cmark_strbuf *s)
 // Destructively unescape a string: remove backslashes before punctuation chars.
 extern void cmark_strbuf_unescape(cmark_strbuf *buf)
 {
-	int r, w;
+	bufsize_t r, w;
 
 	for (r = 0, w = 0; r < buf->size; ++r) {
 		if (buf->ptr[r] == '\\' && cmark_ispunct(buf->ptr[r + 1]))
-			continue;
+			r++;
 
 		buf->ptr[w++] = buf->ptr[r];
 	}

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/buffer.h
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/buffer.h b/compiler/modules/CommonMark/src/buffer.h
index fb9f910..babd051 100644
--- a/compiler/modules/CommonMark/src/buffer.h
+++ b/compiler/modules/CommonMark/src/buffer.h
@@ -3,22 +3,25 @@
 
 #include <stddef.h>
 #include <stdarg.h>
+#include <string.h>
+#include <limits.h>
 #include "config.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef int bufsize_t;
+
 typedef struct {
 	unsigned char *ptr;
-	int asize, size;
+	bufsize_t asize, size;
 } cmark_strbuf;
 
 extern unsigned char cmark_strbuf__initbuf[];
 
-extern unsigned char cmark_strbuf__oom[];
-
 #define GH_BUF_INIT { cmark_strbuf__initbuf, 0, 0 }
+#define BUFSIZE_MAX INT_MAX
 
 /**
  * Initialize a cmark_strbuf structure.
@@ -26,51 +29,22 @@ extern unsigned char cmark_strbuf__oom[];
  * For the cases where GH_BUF_INIT cannot be used to do static
  * initialization.
  */
-void cmark_strbuf_init(cmark_strbuf *buf, int initial_size);
-
-/**
- * Attempt to grow the buffer to hold at least `target_size` bytes.
- *
- * If the allocation fails, this will return an error.  If mark_oom is true,
- * this will mark the buffer as invalid for future operations; if false,
- * existing buffer content will be preserved, but calling code must handle
- * that buffer was not expanded.
- */
-int cmark_strbuf_try_grow(cmark_strbuf *buf, int target_size, bool mark_oom);
+void cmark_strbuf_init(cmark_strbuf *buf, bufsize_t initial_size);
 
 /**
  * Grow the buffer to hold at least `target_size` bytes.
- *
- * If the allocation fails, this will return an error and the buffer will be
- * marked as invalid for future operations, invaliding contents.
- *
- * @return 0 on success or -1 on failure
  */
-int cmark_strbuf_grow(cmark_strbuf *buf, int target_size);
+void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size);
 
 void cmark_strbuf_free(cmark_strbuf *buf);
 void cmark_strbuf_swap(cmark_strbuf *buf_a, cmark_strbuf *buf_b);
 
-/**
- * Test if there have been any reallocation failures with this cmark_strbuf.
- *
- * Any function that writes to a cmark_strbuf can fail due to memory allocation
- * issues.  If one fails, the cmark_strbuf will be marked with an OOM error and
- * further calls to modify the buffer will fail.  Check cmark_strbuf_oom() at the
- * end of your sequence and it will be true if you ran out of memory at any
- * point with that buffer.
- *
- * @return false if no error, true if allocation error
- */
-bool cmark_strbuf_oom(const cmark_strbuf *buf);
-
-size_t cmark_strbuf_len(const cmark_strbuf *buf);
+bufsize_t cmark_strbuf_len(const cmark_strbuf *buf);
 
 int cmark_strbuf_cmp(const cmark_strbuf *a, const cmark_strbuf *b);
 
-void cmark_strbuf_attach(cmark_strbuf *buf, unsigned char *ptr, int asize);
 unsigned char *cmark_strbuf_detach(cmark_strbuf *buf);
-void cmark_strbuf_copy_cstr(char *data, int datasize, const cmark_strbuf *buf);
+void cmark_strbuf_copy_cstr(char *data, bufsize_t datasize, const cmark_strbuf *buf);
 
 static inline const char *cmark_strbuf_cstr(const cmark_strbuf *buf)
 {
@@ -79,33 +53,41 @@ static inline const char *cmark_strbuf_cstr(const cmark_strbuf *buf)
 
 #define cmark_strbuf_at(buf, n) ((buf)->ptr[n])
 
-/*
- * Functions below that return int value error codes will return 0 on
- * success or -1 on failure (which generally means an allocation failed).
- * Using a cmark_strbuf where the allocation has failed with result in -1 from
- * all further calls using that buffer.  As a result, you can ignore the
- * return code of these functions and call them in a series then just call
- * cmark_strbuf_oom at the end.
- */
-int cmark_strbuf_set(cmark_strbuf *buf, const unsigned char *data, int len);
-int cmark_strbuf_sets(cmark_strbuf *buf, const char *string);
-int cmark_strbuf_putc(cmark_strbuf *buf, int c);
-int cmark_strbuf_put(cmark_strbuf *buf, const unsigned char *data, int len);
-int cmark_strbuf_puts(cmark_strbuf *buf, const char *string);
-int cmark_strbuf_printf(cmark_strbuf *buf, const char *format, ...)
-CMARK_ATTRIBUTE((format (printf, 2, 3)));
-int cmark_strbuf_vprintf(cmark_strbuf *buf, const char *format, va_list ap);
+void cmark_strbuf_set(cmark_strbuf *buf, const unsigned char *data, bufsize_t len);
+void cmark_strbuf_sets(cmark_strbuf *buf, const char *string);
+void cmark_strbuf_putc(cmark_strbuf *buf, int c);
+void cmark_strbuf_put(cmark_strbuf *buf, const unsigned char *data, bufsize_t len);
+void cmark_strbuf_puts(cmark_strbuf *buf, const char *string);
+void cmark_strbuf_printf(cmark_strbuf *buf, const char *format, ...)
+	CMARK_ATTRIBUTE((format (printf, 2, 3)));
+void cmark_strbuf_vprintf(cmark_strbuf *buf, const char *format, va_list ap);
 void cmark_strbuf_clear(cmark_strbuf *buf);
 
-int cmark_strbuf_strchr(const cmark_strbuf *buf, int c, int pos);
-int cmark_strbuf_strrchr(const cmark_strbuf *buf, int c, int pos);
-void cmark_strbuf_drop(cmark_strbuf *buf, int n);
-void cmark_strbuf_truncate(cmark_strbuf *buf, int len);
+bufsize_t cmark_strbuf_strchr(const cmark_strbuf *buf, int c, bufsize_t pos);
+bufsize_t cmark_strbuf_strrchr(const cmark_strbuf *buf, int c, bufsize_t pos);
+void cmark_strbuf_drop(cmark_strbuf *buf, bufsize_t n);
+void cmark_strbuf_truncate(cmark_strbuf *buf, bufsize_t len);
 void cmark_strbuf_rtrim(cmark_strbuf *buf);
 void cmark_strbuf_trim(cmark_strbuf *buf);
 void cmark_strbuf_normalize_whitespace(cmark_strbuf *s);
 void cmark_strbuf_unescape(cmark_strbuf *s);
 
+/* Print error and abort. */
+void cmark_strbuf_overflow_err(void);
+
+static inline bufsize_t
+cmark_strbuf_check_bufsize(size_t size) {
+	if (size > BUFSIZE_MAX) {
+		cmark_strbuf_overflow_err();
+	}
+	return (bufsize_t)size;
+}
+
+static inline bufsize_t
+cmark_strbuf_safe_strlen(const char *str) {
+	return cmark_strbuf_check_bufsize(strlen(str));
+}
+
 #ifdef __cplusplus
 }
 #endif

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/chunk.h
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/chunk.h b/compiler/modules/CommonMark/src/chunk.h
index 54c4b16..f23a02d 100644
--- a/compiler/modules/CommonMark/src/chunk.h
+++ b/compiler/modules/CommonMark/src/chunk.h
@@ -7,10 +7,12 @@
 #include "cmark_ctype.h"
 #include "buffer.h"
 
+#define CMARK_CHUNK_EMPTY { NULL, 0, 0 }
+
 typedef struct {
 	unsigned char *data;
-	int len;
-	int alloc;  // also implies a NULL-terminated string
+	bufsize_t len;
+	bufsize_t alloc;  // also implies a NULL-terminated string
 } cmark_chunk;
 
 static inline void cmark_chunk_free(cmark_chunk *c)
@@ -49,10 +51,10 @@ static inline void cmark_chunk_trim(cmark_chunk *c)
 	cmark_chunk_rtrim(c);
 }
 
-static inline int cmark_chunk_strchr(cmark_chunk *ch, int c, int offset)
+static inline bufsize_t cmark_chunk_strchr(cmark_chunk *ch, int c, bufsize_t offset)
 {
 	const unsigned char *p = (unsigned char *)memchr(ch->data + offset, c, ch->len - offset);
-	return p ? (int)(p - ch->data) : ch->len;
+	return p ? (bufsize_t)(p - ch->data) : ch->len;
 }
 
 static inline const char *cmark_chunk_to_cstr(cmark_chunk *c)
@@ -64,7 +66,9 @@ static inline const char *cmark_chunk_to_cstr(cmark_chunk *c)
 	}
 	str = (unsigned char *)malloc(c->len + 1);
 	if(str != NULL) {
-		memcpy(str, c->data, c->len);
+		if(c->len > 0) {
+			memcpy(str, c->data, c->len);
+		}
 		str[c->len] = 0;
 	}
 	c->data  = str;
@@ -78,19 +82,26 @@ static inline void cmark_chunk_set_cstr(cmark_chunk *c, const char *str)
 	if (c->alloc) {
 		free(c->data);
 	}
-	c->len   = strlen(str);
-	c->data  = (unsigned char *)malloc(c->len + 1);
-	c->alloc = 1;
-	memcpy(c->data, str, c->len + 1);
+	if (str == NULL) {
+		c->len   = 0;
+		c->data  = NULL;
+		c->alloc = 0;
+	} else {
+		c->len   = cmark_strbuf_safe_strlen(str);
+		c->data  = (unsigned char *)malloc(c->len + 1);
+		c->alloc = 1;
+		memcpy(c->data, str, c->len + 1);
+	}
 }
 
 static inline cmark_chunk cmark_chunk_literal(const char *data)
 {
-	cmark_chunk c = {(unsigned char *)data, data ? strlen(data) : 0, 0};
+	bufsize_t len = data ? cmark_strbuf_safe_strlen(data) : 0;
+	cmark_chunk c = {(unsigned char *)data, len, 0};
 	return c;
 }
 
-static inline cmark_chunk cmark_chunk_dup(const cmark_chunk *ch, int pos, int len)
+static inline cmark_chunk cmark_chunk_dup(const cmark_chunk *ch, bufsize_t pos, bufsize_t len)
 {
 	cmark_chunk c = {ch->data + pos, len, 0};
 	return c;

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/cmark.c
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/cmark.c b/compiler/modules/CommonMark/src/cmark.c
index 1d7a500..ca9807b 100644
--- a/compiler/modules/CommonMark/src/cmark.c
+++ b/compiler/modules/CommonMark/src/cmark.c
@@ -6,14 +6,24 @@
 #include "cmark.h"
 #include "buffer.h"
 
-char *cmark_markdown_to_html(const char *text, int len)
+int cmark_version()
+{
+	return CMARK_VERSION;
+}
+
+const char *cmark_version_string()
+{
+	return CMARK_VERSION_STRING;
+}
+
+char *cmark_markdown_to_html(const char *text, size_t len, int options)
 {
 	cmark_node *doc;
 	char *result;
 
-	doc = cmark_parse_document(text, len);
+	doc = cmark_parse_document(text, len, options);
 
-	result = cmark_render_html(doc, CMARK_OPT_DEFAULT);
+	result = cmark_render_html(doc, options);
 	cmark_node_free(doc);
 
 	return result;

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/cmark.h
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/cmark.h b/compiler/modules/CommonMark/src/cmark.h
index 04ca6d7..4a85f26 100644
--- a/compiler/modules/CommonMark/src/cmark.h
+++ b/compiler/modules/CommonMark/src/cmark.h
@@ -2,7 +2,8 @@
 #define CMARK_H
 
 #include <stdio.h>
-#include "cmark_export.h"
+#include <cmark_export.h>
+#include <cmark_version.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -18,16 +19,12 @@ extern "C" {
  * ## Simple Interface
  */
 
-/** Current version of library.
- */
-#define CMARK_VERSION "0.1"
-
 /** Convert 'text' (assumed to be a UTF-8 encoded string with length
  * 'len' from CommonMark Markdown to HTML, returning a null-terminated,
  * UTF-8-encoded string.
  */
 CMARK_EXPORT
-char *cmark_markdown_to_html(const char *text, int len);
+char *cmark_markdown_to_html(const char *text, size_t len, int options);
 
 /** ## Node Structure
  */
@@ -213,6 +210,12 @@ CMARK_EXPORT
 cmark_event_type
 cmark_iter_get_event_type(cmark_iter *iter);
 
+/** Returns the root node.
+ */
+CMARK_EXPORT
+cmark_node*
+cmark_iter_get_root(cmark_iter *iter);
+
 /** Resets the iterator so that the current node is 'current' and
  * the event type is 'event_type'.  The new current node must be a
  * descendant of the root node or the root node itself.
@@ -226,6 +229,17 @@ cmark_iter_reset(cmark_iter *iter, cmark_node *current,
  * ## Accessors
  */
 
+/** Returns the user data of 'node'.
+ */
+CMARK_EXPORT void*
+cmark_node_get_user_data(cmark_node *node);
+
+/** Sets arbitrary user data for 'node'.  Returns 1 on success,
+ * 0 on failure.
+ */
+CMARK_EXPORT int
+cmark_node_set_user_data(cmark_node *node, void *user_data);
+
 /** Returns the type of 'node', or `CMARK_NODE_NONE` on error.
  */
 CMARK_EXPORT cmark_node_type
@@ -398,11 +412,12 @@ cmark_consolidate_text_nodes(cmark_node *root);
  *
  * Simple interface:
  *
- *     cmark_node *document = cmark_parse_document("Hello *world*", 12);
+ *     cmark_node *document = cmark_parse_document("Hello *world*", 12,
+ *                                                 CMARK_OPT_DEFAULT);
  *
  * Streaming interface:
  *
- *     cmark_parser *parser = cmark_parser_new();
+ *     cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT);
  *     FILE *fp = fopen("myfile.md", "r");
  *     while ((bytes = fread(buffer, 1, sizeof(buffer), fp)) > 0) {
  *     	   cmark_parser_feed(parser, buffer, bytes);
@@ -417,7 +432,7 @@ cmark_consolidate_text_nodes(cmark_node *root);
 /** Creates a new parser object.
  */
 CMARK_EXPORT
-cmark_parser *cmark_parser_new();
+cmark_parser *cmark_parser_new(int options);
 
 /** Frees memory allocated for a parser object.
  */
@@ -438,13 +453,13 @@ cmark_node *cmark_parser_finish(cmark_parser *parser);
  * Returns a pointer to a tree of nodes.
  */
 CMARK_EXPORT
-cmark_node *cmark_parse_document(const char *buffer, size_t len);
+cmark_node *cmark_parse_document(const char *buffer, size_t len, int options);
 
 /** Parse a CommonMark document in file 'f', returning a pointer to
  * a tree of nodes.
  */
 CMARK_EXPORT
-cmark_node *cmark_parse_file(FILE *f);
+cmark_node *cmark_parse_file(FILE *f, int options);
 
 /**
  * ## Rendering
@@ -453,18 +468,28 @@ cmark_node *cmark_parse_file(FILE *f);
 /** Render a 'node' tree as XML.
  */
 CMARK_EXPORT
-char *cmark_render_xml(cmark_node *root, long options);
+char *cmark_render_xml(cmark_node *root, int options);
 
 /** Render a 'node' tree as an HTML fragment.  It is up to the user
  * to add an appropriate header and footer.
  */
 CMARK_EXPORT
-char *cmark_render_html(cmark_node *root, long options);
+char *cmark_render_html(cmark_node *root, int options);
 
 /** Render a 'node' tree as a groff man page, without the header.
  */
 CMARK_EXPORT
-char *cmark_render_man(cmark_node *root, long options);
+char *cmark_render_man(cmark_node *root, int options, int width);
+
+/** Render a 'node' tree as a commonmark document.
+ */
+CMARK_EXPORT
+char *cmark_render_commonmark(cmark_node *root, int options, int width);
+
+/** Render a 'node' tree as a LaTeX document.
+ */
+CMARK_EXPORT
+char *cmark_render_latex(cmark_node *root, int options, int width);
 
 /** Default writer options.
  */
@@ -482,6 +507,45 @@ char *cmark_render_man(cmark_node *root, long options);
  */
 #define CMARK_OPT_NORMALIZE 4
 
+/** Convert straight quotes to curly, --- to em dashes, -- to en dashes.
+ */
+#define CMARK_OPT_SMART 8
+
+/** Validate UTF-8 in the input before parsing, replacing illegal
+ * sequences with the replacement character U+FFFD.
+ */
+#define CMARK_OPT_VALIDATE_UTF8 16
+
+/** Suppress raw HTML and unsafe links (`javascript:`, `vbscript:`,
+ * `file:`, and `data:`, except for `image/png`, `image/gif`,
+ * `image/jpeg`, or `image/webp` mime types).  Raw HTML is replaced
+ * by a placeholder HTML comment. Unsafe links are replaced by
+ * empty strings.
+ */
+#define CMARK_OPT_SAFE 32
+
+/**
+ * ## Version information
+ */
+
+/** The library version as integer for runtime checks. Also available as
+ * macro CMARK_VERSION for compile time checks.
+ *
+ * * Bits 16-23 contain the major version.
+ * * Bits 8-15 contain the minor version.
+ * * Bits 0-7 contain the patchlevel.
+ *
+ * In hexadecimal format, the number 0x010203 represents version 1.2.3.
+ */
+CMARK_EXPORT
+int cmark_version();
+
+/** The library version string for runtime checks. Also available as
+ * macro CMARK_VERSION_STRING for compile time checks.
+ */
+CMARK_EXPORT
+const char *cmark_version_string();
+
 /** # AUTHORS
  *
  * John MacFarlane, Vicent Marti,  Kārlis Gaņģis, Nick Wellnhofer.
@@ -506,7 +570,6 @@ char *cmark_render_man(cmark_node *root, long options);
 #define NODE_STRONG               CMARK_NODE_STRONG
 #define NODE_LINK                 CMARK_NODE_LINK
 #define NODE_IMAGE                CMARK_NODE_IMAGE
-#define NODE_LINK_LABEL           CMARK_NODE_LINK_LABEL
 #define BULLET_LIST               CMARK_BULLET_LIST
 #define ORDERED_LIST              CMARK_ORDERED_LIST
 #define PERIOD_DELIM              CMARK_PERIOD_DELIM

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/89c7b809/compiler/modules/CommonMark/src/cmark_version.h
----------------------------------------------------------------------
diff --git a/compiler/modules/CommonMark/src/cmark_version.h b/compiler/modules/CommonMark/src/cmark_version.h
new file mode 100644
index 0000000..fb15ba5
--- /dev/null
+++ b/compiler/modules/CommonMark/src/cmark_version.h
@@ -0,0 +1,7 @@
+#ifndef CMARK_VERSION_H
+#define CMARK_VERSION_H
+
+#define CMARK_VERSION ((0 << 16) | (21 << 8)  | 0)
+#define CMARK_VERSION_STRING "0.21.0"
+
+#endif