From 2eb8ca8f2f4463448806663d72218cdcc0a9f38c Mon Sep 17 00:00:00 2001 From: Bas Alberts Date: Thu, 6 Apr 2023 14:42:40 -0400 Subject: [PATCH] Update cmark-upstream to https://github.com/github/cmark-gfm/commit/c8dcdc71ce1e2028d2d4d77c09c156512ffb3fa2 --- ext/commonmarker/blocks.c | 131 +++++----------------------- ext/commonmarker/cmark-gfm.h | 21 ++--- ext/commonmarker/cmark-upstream | 2 +- ext/commonmarker/commonmark.c | 3 +- ext/commonmarker/man.c | 3 +- ext/commonmarker/node.c | 27 +++++- ext/commonmarker/node.h | 9 +- ext/commonmarker/parser.h | 39 --------- ext/commonmarker/plaintext.c | 3 +- ext/commonmarker/render.c | 13 ++- ext/commonmarker/render.h | 1 - ext/commonmarker/syntax_extension.c | 5 +- ext/commonmarker/table.c | 6 -- 13 files changed, 80 insertions(+), 183 deletions(-) diff --git a/ext/commonmarker/blocks.c b/ext/commonmarker/blocks.c index b5a02b7d..03a58748 100644 --- a/ext/commonmarker/blocks.c +++ b/ext/commonmarker/blocks.c @@ -27,6 +27,14 @@ #define CODE_INDENT 4 #define TAB_STOP 4 +/** + * Very deeply nested lists can cause quadratic performance issues. + * This constant is used in open_new_blocks() to limit the nesting + * depth. It is unlikely that a non-contrived markdown document will + * be nested this deeply. + */ +#define MAX_LIST_DEPTH 100 + #ifndef MIN #define MIN(x, y) ((x < y) ? x : y) #endif @@ -70,22 +78,6 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, static void S_process_line(cmark_parser *parser, const unsigned char *buffer, bufsize_t bytes); -static void subtract_open_block_counts(cmark_parser *parser, cmark_node *node) { - do { - decr_open_block_count(parser, S_type(node)); - node->flags &= ~CMARK_NODE__OPEN_BLOCK; - node = node->last_child; - } while (node); -} - -static void add_open_block_counts(cmark_parser *parser, cmark_node *node) { - do { - incr_open_block_count(parser, S_type(node)); - node->flags |= CMARK_NODE__OPEN_BLOCK; - node = node->last_child; - } while (node); -} - static cmark_node *make_block(cmark_mem *mem, cmark_node_type tag, int start_line, int start_column) { cmark_node *e; @@ -145,7 +137,6 @@ static void cmark_parser_reset(cmark_parser *parser) { parser->refmap = cmark_reference_map_new(parser->mem); parser->root = document; parser->current = document; - add_open_block_counts(parser, document); parser->syntax_extensions = saved_exts; parser->inline_syntax_extensions = saved_inline_exts; @@ -259,18 +250,15 @@ static void remove_trailing_blank_lines(cmark_strbuf *ln) { // Check to see if a node ends with a blank line, descending // if needed into lists and sublists. static bool S_ends_with_blank_line(cmark_node *node) { - while (true) { - if (S_last_line_checked(node)) { - return(S_last_line_blank(node)); - } else if ((S_type(node) == CMARK_NODE_LIST || - S_type(node) == CMARK_NODE_ITEM) && node->last_child) { - S_set_last_line_checked(node); - node = node->last_child; - continue; - } else { - S_set_last_line_checked(node); - return (S_last_line_blank(node)); - } + if (S_last_line_checked(node)) { + return(S_last_line_blank(node)); + } else if ((S_type(node) == CMARK_NODE_LIST || + S_type(node) == CMARK_NODE_ITEM) && node->last_child) { + S_set_last_line_checked(node); + return(S_ends_with_blank_line(node->last_child)); + } else { + S_set_last_line_checked(node); + return (S_last_line_blank(node)); } } @@ -330,12 +318,6 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { has_content = resolve_reference_link_definitions(parser, b); if (!has_content) { // remove blank node (former reference def) - if (b->flags & CMARK_NODE__OPEN_BLOCK) { - decr_open_block_count(parser, S_type(b)); - if (b->prev) { - add_open_block_counts(parser, b->prev); - } - } cmark_node_free(b); } break; @@ -408,17 +390,6 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { return parent; } -// Recalculates the number of open blocks. Returns true if it matches what's currently stored -// in parser. (Used to check that the counts in parser, which are updated incrementally, are -// correct.) -bool check_open_block_counts(cmark_parser *parser) { - cmark_parser tmp_parser = {0}; // Only used for its open_block_counts and total_open_blocks fields. - add_open_block_counts(&tmp_parser, parser->root); - return - tmp_parser.total_open_blocks == parser->total_open_blocks && - memcmp(tmp_parser.open_block_counts, parser->open_block_counts, sizeof(parser->open_block_counts)) == 0; -} - // Add a node as child of another. Return pointer to child. static cmark_node *add_child(cmark_parser *parser, cmark_node *parent, cmark_node_type block_type, int start_column) { @@ -437,14 +408,11 @@ static cmark_node *add_child(cmark_parser *parser, cmark_node *parent, if (parent->last_child) { parent->last_child->next = child; child->prev = parent->last_child; - subtract_open_block_counts(parser, parent->last_child); } else { parent->first_child = child; child->prev = NULL; } parent->last_child = child; - add_open_block_counts(parser, child); - return child; } @@ -1087,14 +1055,8 @@ static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input, *all_matched = false; cmark_node *container = parser->root; cmark_node_type cont_type; - cmark_parser tmp_parser; // Only used for its open_block_counts and total_open_blocks fields. - memcpy(tmp_parser.open_block_counts, parser->open_block_counts, sizeof(parser->open_block_counts)); - tmp_parser.total_open_blocks = parser->total_open_blocks; - - assert(check_open_block_counts(parser)); while (S_last_child_is_open(container)) { - decr_open_block_count(&tmp_parser, S_type(container)); container = container->last_child; cont_type = S_type(container); @@ -1106,53 +1068,6 @@ static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input, continue; } - // This block of code is a workaround for the quadratic performance - // issue described here (issue 2): - // - // https://github.com/github/cmark-gfm/security/advisories/GHSA-66g8-4hjf-77xh - // - // If the current line is empty then we might be able to skip directly - // to the end of the list of open blocks. To determine whether this is - // possible, we have been maintaining a count of the number of - // different types of open blocks. The main criterium is that every - // remaining block, except the last element of the list, is a LIST or - // ITEM. The code below checks the conditions, and if they're ok, skips - // forward to parser->current. - if (parser->blank && parser->indent == 0) { // Current line is empty - // Make sure that parser->current doesn't point to a closed block. - if (parser->current->flags & CMARK_NODE__OPEN_BLOCK) { - if (parser->current->flags & CMARK_NODE__OPEN) { - const size_t n_list = read_open_block_count(&tmp_parser, CMARK_NODE_LIST); - const size_t n_item = read_open_block_count(&tmp_parser, CMARK_NODE_ITEM); - // At most one block can be something other than a LIST or ITEM. - if (n_list + n_item + 1 >= tmp_parser.total_open_blocks) { - // Check that parser->current is suitable for jumping to. - switch (S_type(parser->current)) { - case CMARK_NODE_LIST: - case CMARK_NODE_ITEM: - if (n_list + n_item != tmp_parser.total_open_blocks) { - if (parser->current->last_child == NULL) { - // There's another node type somewhere in the middle of - // the list, so don't attempt the optimization. - break; - } - } - // fall through - case CMARK_NODE_CODE_BLOCK: - case CMARK_NODE_PARAGRAPH: - case CMARK_NODE_HTML_BLOCK: - // Jump to parser->current - container = parser->current; - cont_type = S_type(container); - break; - default: - break; - } - } - } - } - } - switch (cont_type) { case CMARK_NODE_BLOCK_QUOTE: if (!parse_block_quote_prefix(parser, input)) @@ -1212,10 +1127,11 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, bool has_content; int save_offset; int save_column; + size_t depth = 0; while (cont_type != CMARK_NODE_CODE_BLOCK && cont_type != CMARK_NODE_HTML_BLOCK) { - + depth++; S_find_first_nonspace(parser, input); indented = parser->indent >= CODE_INDENT; @@ -1286,9 +1202,8 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, has_content = resolve_reference_link_definitions(parser, *container); if (has_content) { - cmark_node_set_type(*container, CMARK_NODE_HEADING); - decr_open_block_count(parser, CMARK_NODE_PARAGRAPH); - incr_open_block_count(parser, CMARK_NODE_HEADING); + + (*container)->type = (uint16_t)CMARK_NODE_HEADING; (*container)->as.heading.level = lev; (*container)->as.heading.setext = true; S_advance_offset(parser, input, input->len - 1 - parser->offset, false); @@ -1318,6 +1233,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->internal_offset = matched; } else if ((!indented || cont_type == CMARK_NODE_LIST) && parser->indent < 4 && + depth < MAX_LIST_DEPTH && (matched = parse_list_marker( parser->mem, input, parser->first_nonspace, (*container)->type == CMARK_NODE_PARAGRAPH, &data))) { @@ -1443,7 +1359,7 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container, S_set_last_line_blank(container, last_line_blank); tmp = container; - while (tmp->parent && S_last_line_blank(tmp->parent)) { + while (tmp->parent) { S_set_last_line_blank(tmp->parent, false); tmp = tmp->parent; } @@ -1572,7 +1488,6 @@ static void S_process_line(cmark_parser *parser, const unsigned char *buffer, parser->line_number++; - assert(parser->current->next == NULL); last_matched_container = check_open_blocks(parser, &input, &all_matched); if (!last_matched_container) diff --git a/ext/commonmarker/cmark-gfm.h b/ext/commonmarker/cmark-gfm.h index 902c392d..0544057a 100644 --- a/ext/commonmarker/cmark-gfm.h +++ b/ext/commonmarker/cmark-gfm.h @@ -37,16 +37,6 @@ char *cmark_markdown_to_html(const char *text, size_t len, int options); #define CMARK_NODE_TYPE_MASK (0xc000) #define CMARK_NODE_VALUE_MASK (0x3fff) -/** - * This is the maximum number of block types (CMARK_NODE_DOCUMENT, - * CMARK_NODE_HEADING, ...). It needs to be bigger than the number of - * hardcoded block types (below) to allow for extensions (see - * cmark_syntax_extension_add_node). But it also determines the size of the - * open_block_counts array in the cmark_parser struct, so we don't want it - * to be excessively large. - */ -#define CMARK_NODE_TYPE_BLOCK_LIMIT 0x20 - typedef enum { /* Error status */ CMARK_NODE_NONE = 0x0000, @@ -423,6 +413,17 @@ CMARK_GFM_EXPORT int cmark_node_get_list_tight(cmark_node *node); */ CMARK_GFM_EXPORT int cmark_node_set_list_tight(cmark_node *node, int tight); +/** + * Returns item index of 'node'. This is only used when rendering output + * formats such as commonmark, which need to output the index. It is not + * required for formats such as html or latex. + */ +CMARK_GFM_EXPORT int cmark_node_get_item_index(cmark_node *node); + +/** Sets item index of 'node'. Returns 1 on success, 0 on failure. + */ +CMARK_GFM_EXPORT int cmark_node_set_item_index(cmark_node *node, int idx); + /** Returns the info string from a fenced code block. */ CMARK_GFM_EXPORT const char *cmark_node_get_fence_info(cmark_node *node); diff --git a/ext/commonmarker/cmark-upstream b/ext/commonmarker/cmark-upstream index dcf6b386..c8dcdc71 160000 --- a/ext/commonmarker/cmark-upstream +++ b/ext/commonmarker/cmark-upstream @@ -1 +1 @@ -Subproject commit dcf6b3862d2885b7d3dcdfc5b30c6a73526be32c +Subproject commit c8dcdc71ce1e2028d2d4d77c09c156512ffb3fa2 diff --git a/ext/commonmarker/commonmark.c b/ext/commonmarker/commonmark.c index f2210cdf..4815bfc3 100644 --- a/ext/commonmarker/commonmark.c +++ b/ext/commonmarker/commonmark.c @@ -216,14 +216,13 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, LIT(""); BLANKLINE(); } - renderer->list_number = cmark_node_get_list_start(node); break; case CMARK_NODE_ITEM: if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) { marker_width = 4; } else { - list_number = renderer->list_number++; + list_number = cmark_node_get_item_index(node); list_delim = cmark_node_get_list_delim(node->parent); // we ensure a width of at least 4 so // we get nice transition from single digits diff --git a/ext/commonmarker/man.c b/ext/commonmarker/man.c index e40e46ce..634fd9d0 100644 --- a/ext/commonmarker/man.c +++ b/ext/commonmarker/man.c @@ -113,7 +113,6 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, break; case CMARK_NODE_LIST: - renderer->list_number = cmark_node_get_list_start(node); break; case CMARK_NODE_ITEM: @@ -123,7 +122,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) { LIT("\\[bu] 2"); } else { - list_number = renderer->list_number++; + list_number = cmark_node_get_item_index(node); char list_number_s[LIST_NUMBER_SIZE]; snprintf(list_number_s, LIST_NUMBER_SIZE, "\"%d.\" 4", list_number); LIT(list_number_s); diff --git a/ext/commonmarker/node.c b/ext/commonmarker/node.c index 4866786d..67f657d8 100644 --- a/ext/commonmarker/node.c +++ b/ext/commonmarker/node.c @@ -39,7 +39,7 @@ void cmark_register_node_flag(cmark_node_internal_flags *flags) { nextflag <<= 1; } -void cmark_init_standard_node_flags() {} +void cmark_init_standard_node_flags(void) {} bool cmark_node_can_contain_type(cmark_node *node, cmark_node_type child_type) { if (child_type == CMARK_NODE_DOCUMENT) { @@ -564,6 +564,31 @@ int cmark_node_set_list_tight(cmark_node *node, int tight) { } } +int cmark_node_get_item_index(cmark_node *node) { + if (node == NULL) { + return 0; + } + + if (node->type == CMARK_NODE_ITEM) { + return node->as.list.start; + } else { + return 0; + } +} + +int cmark_node_set_item_index(cmark_node *node, int idx) { + if (node == NULL || idx < 0) { + return 0; + } + + if (node->type == CMARK_NODE_ITEM) { + node->as.list.start = idx; + return 1; + } else { + return 0; + } +} + const char *cmark_node_get_fence_info(cmark_node *node) { if (node == NULL) { return NULL; diff --git a/ext/commonmarker/node.h b/ext/commonmarker/node.h index ffebcb60..38ac4a6f 100644 --- a/ext/commonmarker/node.h +++ b/ext/commonmarker/node.h @@ -50,13 +50,12 @@ typedef struct { enum cmark_node__internal_flags { CMARK_NODE__OPEN = (1 << 0), - CMARK_NODE__OPEN_BLOCK = (1 << 1), - CMARK_NODE__LAST_LINE_BLANK = (1 << 2), - CMARK_NODE__LAST_LINE_CHECKED = (1 << 3), + CMARK_NODE__LAST_LINE_BLANK = (1 << 1), + CMARK_NODE__LAST_LINE_CHECKED = (1 << 2), // Extensions can register custom flags by calling `cmark_register_node_flag`. // This is the starting value for the custom flags. - CMARK_NODE__REGISTER_FIRST = (1 << 4), + CMARK_NODE__REGISTER_FIRST = (1 << 3), }; typedef uint16_t cmark_node_internal_flags; @@ -128,7 +127,7 @@ void cmark_register_node_flag(cmark_node_internal_flags *flags); * library. It is now a no-op. */ CMARK_GFM_EXPORT -void cmark_init_standard_node_flags(); +void cmark_init_standard_node_flags(void); static CMARK_INLINE cmark_mem *cmark_node_mem(cmark_node *node) { return node->content.mem; diff --git a/ext/commonmarker/parser.h b/ext/commonmarker/parser.h index 05403fe3..436c53f5 100644 --- a/ext/commonmarker/parser.h +++ b/ext/commonmarker/parser.h @@ -50,47 +50,8 @@ struct cmark_parser { cmark_llist *syntax_extensions; cmark_llist *inline_syntax_extensions; cmark_ispunct_func backslash_ispunct; - - /** - * The "open" blocks are the blocks visited by the loop in - * check_open_blocks (blocks.c). I.e. the blocks in this list: - * - * parser->root->last_child->...->last_child - * - * open_block_counts is used to keep track of how many of each type of - * node are currently in the open blocks list. Knowing these counts can - * sometimes help to end the loop in check_open_blocks early, improving - * efficiency. - * - * The count is stored at this offset: type - CMARK_NODE_TYPE_BLOCK - 1 - * For example, CMARK_NODE_LIST (0x8003) is stored at offset 2. - */ - size_t open_block_counts[CMARK_NODE_TYPE_BLOCK_LIMIT]; - size_t total_open_blocks; }; -static CMARK_INLINE void incr_open_block_count(cmark_parser *parser, cmark_node_type type) { - assert(type > CMARK_NODE_TYPE_BLOCK); - assert(type <= CMARK_NODE_TYPE_BLOCK + CMARK_NODE_TYPE_BLOCK_LIMIT); - parser->open_block_counts[type - CMARK_NODE_TYPE_BLOCK - 1]++; - parser->total_open_blocks++; -} - -static CMARK_INLINE void decr_open_block_count(cmark_parser *parser, cmark_node_type type) { - assert(type > CMARK_NODE_TYPE_BLOCK); - assert(type <= CMARK_NODE_TYPE_BLOCK + CMARK_NODE_TYPE_BLOCK_LIMIT); - assert(parser->open_block_counts[type - CMARK_NODE_TYPE_BLOCK - 1] > 0); - parser->open_block_counts[type - CMARK_NODE_TYPE_BLOCK - 1]--; - assert(parser->total_open_blocks > 0); - parser->total_open_blocks--; -} - -static CMARK_INLINE size_t read_open_block_count(cmark_parser *parser, cmark_node_type type) { - assert(type > CMARK_NODE_TYPE_BLOCK); - assert(type <= CMARK_NODE_TYPE_BLOCK + CMARK_NODE_TYPE_BLOCK_LIMIT); - return parser->open_block_counts[type - CMARK_NODE_TYPE_BLOCK - 1]; -} - #ifdef __cplusplus } #endif diff --git a/ext/commonmarker/plaintext.c b/ext/commonmarker/plaintext.c index a4047621..0c7d257b 100644 --- a/ext/commonmarker/plaintext.c +++ b/ext/commonmarker/plaintext.c @@ -61,14 +61,13 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, node->next->type == CMARK_NODE_LIST)) { CR(); } - renderer->list_number = cmark_node_get_list_start(node); break; case CMARK_NODE_ITEM: if (cmark_node_get_list_type(node->parent) == CMARK_BULLET_LIST) { marker_width = 4; } else { - list_number = renderer->list_number++; + list_number = cmark_node_get_item_index(node); list_delim = cmark_node_get_list_delim(node->parent); // we ensure a width of at least 4 so // we get nice transition from single digits diff --git a/ext/commonmarker/render.c b/ext/commonmarker/render.c index d7a83ebf..1a0d2ae8 100644 --- a/ext/commonmarker/render.c +++ b/ext/commonmarker/render.c @@ -171,8 +171,8 @@ char *cmark_render(cmark_mem *mem, cmark_node *root, int options, int width, cmark_renderer renderer = {mem, &buf, &pref, 0, width, 0, 0, true, true, false, - false, 0, outc, S_cr, S_blankline, - S_out, 0}; + false, outc, S_cr, S_blankline, S_out, + 0}; while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { cur = cmark_iter_get_node(iter); @@ -181,6 +181,15 @@ char *cmark_render(cmark_mem *mem, cmark_node *root, int options, int width, } else if (cur->parent) { cur->ancestor_extension = cur->parent->ancestor_extension; } + if (cur->type == CMARK_NODE_ITEM) { + // Calculate the list item's index, for the benefit of output formats + // like commonmark and plaintext. + if (cur->prev) { + cmark_node_set_item_index(cur, 1 + cmark_node_get_item_index(cur->prev)); + } else { + cmark_node_set_item_index(cur, cmark_node_get_list_start(cur->parent)); + } + } if (!render_node(&renderer, cur, ev_type, options)) { // a false value causes us to skip processing // the node's contents. this is used for diff --git a/ext/commonmarker/render.h b/ext/commonmarker/render.h index aa5162f9..4a68d1e0 100644 --- a/ext/commonmarker/render.h +++ b/ext/commonmarker/render.h @@ -23,7 +23,6 @@ struct cmark_renderer { bool begin_content; bool no_linebreaks; bool in_tight_list_item; - int list_number; void (*outc)(struct cmark_renderer *, cmark_node *, cmark_escaping, int32_t, unsigned char); void (*cr)(struct cmark_renderer *); void (*blankline)(struct cmark_renderer *); diff --git a/ext/commonmarker/syntax_extension.c b/ext/commonmarker/syntax_extension.c index a2fb3b04..d24fe43e 100644 --- a/ext/commonmarker/syntax_extension.c +++ b/ext/commonmarker/syntax_extension.c @@ -29,10 +29,7 @@ cmark_syntax_extension *cmark_syntax_extension_new(const char *name) { cmark_node_type cmark_syntax_extension_add_node(int is_inline) { cmark_node_type *ref = !is_inline ? &CMARK_NODE_LAST_BLOCK : &CMARK_NODE_LAST_INLINE; - if ((*ref & CMARK_NODE_VALUE_MASK) >= CMARK_NODE_TYPE_BLOCK_LIMIT) { - // This assertion will fail if you try to register more extensions than - // are currently allowed by CMARK_NODE_TYPE_BLOCK_MAXNUM. Try increasing - // the limit. + if ((*ref & CMARK_NODE_VALUE_MASK) == CMARK_NODE_VALUE_MASK) { assert(false); return (cmark_node_type) 0; } diff --git a/ext/commonmarker/table.c b/ext/commonmarker/table.c index 6e75e38d..e53ea315 100644 --- a/ext/commonmarker/table.c +++ b/ext/commonmarker/table.c @@ -311,18 +311,12 @@ static cmark_node *try_opening_table_header(cmark_syntax_extension *self, } } - assert(cmark_node_get_type(parent_container) == CMARK_NODE_PARAGRAPH); if (!cmark_node_set_type(parent_container, CMARK_NODE_TABLE)) { free_table_row(parser->mem, header_row); free_table_row(parser->mem, marker_row); return parent_container; } - // Update the node counts after parent_container changed type. - assert(parent_container->next == NULL); - decr_open_block_count(parser, CMARK_NODE_PARAGRAPH); - incr_open_block_count(parser, CMARK_NODE_TABLE); - if (header_row->paragraph_offset) { try_inserting_table_header_paragraph(parser, parent_container, (unsigned char *)parent_string, header_row->paragraph_offset);