Add support for lists in markdown

2026-04-30 01:32:30 +00:00 · 2023-05-22 14:14:17 +03:00
parent 469a731892
commit 6bbde4ccdc
7 changed files with 229 additions and 174 deletions
--- a/DiscordChatExporter.Core/Markdown/ListItemNode.cs
+++ b/DiscordChatExporter.Core/Markdown/ListItemNode.cs
@@ -0,0 +1,5 @@
+using System.Collections.Generic;
+
+namespace DiscordChatExporter.Core.Markdown;
+
+internal record ListItemNode(IReadOnlyList<MarkdownNode> Children) : MarkdownNode, IContainerNode;
--- a/DiscordChatExporter.Core/Markdown/ListNode.cs
+++ b/DiscordChatExporter.Core/Markdown/ListNode.cs
@@ -0,0 +1,5 @@
+using System.Collections.Generic;
+
+namespace DiscordChatExporter.Core.Markdown;
+
+internal record ListNode(IReadOnlyList<ListItemNode> Items) : MarkdownNode;
--- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs
+++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownParser.cs
@@ -24,13 +24,13 @@ internal static partial class MarkdownParser
    /* Formatting */

    private static readonly IMatcher<MarkdownNode> BoldFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the earliest double asterisk not followed by an asterisk.
+        // There must be exactly two closing asterisks.
        new Regex(@"\*\*(.+?)\*\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Bold, Parse(s.Relocate(m.Groups[1])))
    );

    private static readonly IMatcher<MarkdownNode> ItalicFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the earliest single asterisk not preceded or followed by an asterisk.
+        // There must be exactly one closing asterisk.
        // Opening asterisk must not be followed by whitespace.
        // Closing asterisk must not be preceded by whitespace.
        new Regex(@"\*(?!\s)(.+?)(?<!\s|\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
@@ -38,27 +38,26 @@ internal static partial class MarkdownParser
    );

    private static readonly IMatcher<MarkdownNode> ItalicBoldFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the earliest triple asterisk not followed by an asterisk.
+        // There must be exactly three closing asterisks.
        new Regex(@"\*(\*\*.+?\*\*)\*(?!\*)", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1]), BoldFormattingNodeMatcher))
    );

    private static readonly IMatcher<MarkdownNode> ItalicAltFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character except underscore until an underscore.
        // Closing underscore must not be followed by a word character.
-        new Regex(@"_([^_]+)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline),
+        new Regex(@"_(.+?)_(?!\w)", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Italic, Parse(s.Relocate(m.Groups[1])))
    );

    private static readonly IMatcher<MarkdownNode> UnderlineFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the earliest double underscore not followed by an underscore.
+        // There must be exactly two closing underscores.
        new Regex(@"__(.+?)__(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Underline, Parse(s.Relocate(m.Groups[1])))
    );

    private static readonly IMatcher<MarkdownNode> ItalicUnderlineFormattingNodeMatcher =
        new RegexMatcher<MarkdownNode>(
-            // Capture any character until the earliest triple underscore not followed by an underscore.
+            // There must be exactly three closing underscores.
            new Regex(@"_(__.+?__)_(?!_)", DefaultRegexOptions | RegexOptions.Singleline),
            (s, m) => new FormattingNode(
                FormattingKind.Italic,
@@ -67,68 +66,61 @@ internal static partial class MarkdownParser
        );

    private static readonly IMatcher<MarkdownNode> StrikethroughFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the earliest double tilde.
        new Regex(@"~~(.+?)~~", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Strikethrough, Parse(s.Relocate(m.Groups[1])))
    );

    private static readonly IMatcher<MarkdownNode> SpoilerFormattingNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the earliest double pipe.
        new Regex(@"\|\|(.+?)\|\|", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Spoiler, Parse(s.Relocate(m.Groups[1])))
    );

    private static readonly IMatcher<MarkdownNode> SingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the end of the line.
-        // Opening 'greater than' character must be followed by whitespace.
-        // Consume the newline character so that it's not included in the content.
+        // Include the linebreak in the content so that the lines are preserved in quotes.
        new Regex(@"^>\s(.+\n?)", DefaultRegexOptions),
        (s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1])))
    );

    private static readonly IMatcher<MarkdownNode> RepeatedSingleLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Repeatedly capture any character until the end of the line.
-        // Consume the newline character so that it's not included in the content.
+        // Include the linebreaks in the content, so that the lines are preserved in quotes.
        new Regex(@"(?:^>\s(.+\n?)){2,}", DefaultRegexOptions),
-        (_, m) => new FormattingNode(
+        (s, m) => new FormattingNode(
            FormattingKind.Quote,
-            Parse(
-                // Combine all captures into a single string
-                string.Concat(m.Groups[1].Captures.Select(c => c.Value))
-            )
+            m.Groups[1].Captures.SelectMany(c => Parse(s.Relocate(c))).ToArray()
        )
    );

    private static readonly IMatcher<MarkdownNode> MultiLineQuoteNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the end of the input.
-        // Opening 'greater than' characters must be followed by whitespace.
        new Regex(@"^>>>\s(.+)", DefaultRegexOptions | RegexOptions.Singleline),
        (s, m) => new FormattingNode(FormattingKind.Quote, Parse(s.Relocate(m.Groups[1])))
    );

-    /* Headers */
-
    private static readonly IMatcher<MarkdownNode> HeaderNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character until the end of the line.
-        // Opening 'hash' character(s) must be followed by whitespace.
-        // Consume the newline character so that it's not included in the content.
-        new Regex(@"^(\#{1,3})\s(.+\n?)", DefaultRegexOptions),
+        // Consume the linebreak so that it's not attached to following nodes.
+        new Regex(@"^(\#{1,3})\s(.+)\n", DefaultRegexOptions),
        (s, m) => new HeaderNode(m.Groups[1].Length, Parse(s.Relocate(m.Groups[2])))
    );

+    private static readonly IMatcher<MarkdownNode> ListNodeMatcher = new RegexMatcher<MarkdownNode>(
+        // The list marker (- or *) can be preceded by whitespace, which specifies the list's nesting level.
+        // Following lines indented by one more whitespace character than the marker are part of the same list item.
+        // Consume the linebreak so that it's not attached to following nodes.
+        new Regex(@"^(\s*)(?:[\-\*]\s(.+(?:\n\s\1.*)*)?\n?)+", DefaultRegexOptions),
+        (s, m) => new ListNode(
+            m.Groups[2].Captures.Select(c => new ListItemNode(Parse(s.Relocate(c)))).ToArray()
+        )
+    );
+
    /* Code blocks */

    private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any character except backtick until a backtick.
-        // Blank lines at the beginning and at the end of content are trimmed.
-        // There can be either one or two backticks, but equal number on both sides.
+        // One or two backticks are allowed, but they must match on both sides.
        new Regex(@"(`{1,2})([^`]+)\1", DefaultRegexOptions | RegexOptions.Singleline),
-        (_, m) => new InlineCodeBlockNode(m.Groups[2].Value.Trim('\r', '\n'))
+        (_, m) => new InlineCodeBlockNode(m.Groups[2].Value)
    );

    private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture language identifier and then any character until the earliest triple backtick.
-        // Language identifier is one word immediately after opening backticks, followed immediately by newline.
+        // Language identifier is one word immediately after opening backticks, followed immediately by a linebreak.
        // Blank lines at the beginning and at the end of content are trimmed.
        new Regex(@"```(?:(\w*)\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
        (_, m) => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))
@@ -215,7 +207,7 @@ internal static partial class MarkdownParser
    );

    private static readonly IMatcher<MarkdownNode> CodedStandardEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture :thinking: for known emoji codes
+        // Capture emoji codes such as :thinking: (the name is resolved via EmojiIndex).
        new Regex(@":([\w_]+):", DefaultRegexOptions),
        (_, m) => EmojiIndex.TryGetName(m.Groups[1].Value)?.Pipe(n => new EmojiNode(n))
    );
@@ -233,8 +225,8 @@ internal static partial class MarkdownParser
    /* Links */

    private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
-        // Capture any non-whitespace character after http:// or https://
-        // until the last punctuation character or whitespace
+        // Any non-whitespace characters after http:// or https://,
+        // excluding any trailing punctuation (.,:;"') from the match.
        new Regex(@"(https?://\S*[^\.,:;""'\s])", DefaultRegexOptions),
        (_, m) => new LinkNode(m.Groups[1].Value)
    );
@@ -318,8 +310,7 @@ internal static partial class MarkdownParser
        }
    );

-    // Combine all matchers into one.
-    // Matchers that have similar patterns are ordered from most specific to least specific.
+    // Matchers that have similar patterns are ordered from most specific to least specific.
    private static readonly IMatcher<MarkdownNode> NodeMatcher = new AggregateMatcher<MarkdownNode>(
        // Escaped text
        ShrugTextNodeMatcher,
@@ -339,9 +330,8 @@ internal static partial class MarkdownParser
        MultiLineQuoteNodeMatcher,
        RepeatedSingleLineQuoteNodeMatcher,
        SingleLineQuoteNodeMatcher,
-
-        // Headers
        HeaderNodeMatcher,
+        ListNodeMatcher,

        // Code blocks
        MultiLineCodeBlockNodeMatcher,
--- a/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs
+++ b/DiscordChatExporter.Core/Markdown/Parsing/MarkdownVisitor.cs
@@ -7,93 +7,127 @@ namespace DiscordChatExporter.Core.Markdown.Parsing;

 internal abstract class MarkdownVisitor
 {
-    protected virtual ValueTask<MarkdownNode> VisitTextAsync(
+    protected virtual ValueTask VisitTextAsync(
        TextNode text,
-        CancellationToken cancellationToken = default) =>
-        new(text);
+        CancellationToken cancellationToken = default) => default;

-    protected virtual async ValueTask<MarkdownNode> VisitFormattingAsync(
+    protected virtual async ValueTask VisitFormattingAsync(
        FormattingNode formatting,
-        CancellationToken cancellationToken = default)
-    {
+        CancellationToken cancellationToken = default) =>
        await VisitAsync(formatting.Children, cancellationToken);
-        return formatting;
-    }

-    protected virtual async ValueTask<MarkdownNode> VisitHeaderAsync(
+    protected virtual async ValueTask VisitHeaderAsync(
        HeaderNode header,
-        CancellationToken cancellationToken = default)
-    {
+        CancellationToken cancellationToken = default) =>
        await VisitAsync(header.Children, cancellationToken);
-        return header;
-    }

-    protected virtual ValueTask<MarkdownNode> VisitInlineCodeBlockAsync(
+    protected virtual async ValueTask VisitListAsync(
+        ListNode list,
+        CancellationToken cancellationToken = default) =>
+        await VisitAsync(list.Items, cancellationToken);
+
+    protected virtual async ValueTask VisitListItemAsync(
+        ListItemNode listItem,
+        CancellationToken cancellationToken = default) =>
+        await VisitAsync(listItem.Children, cancellationToken);
+
+    protected virtual ValueTask VisitInlineCodeBlockAsync(
        InlineCodeBlockNode inlineCodeBlock,
-        CancellationToken cancellationToken = default) =>
-        new(inlineCodeBlock);
+        CancellationToken cancellationToken = default) => default;

-    protected virtual ValueTask<MarkdownNode> VisitMultiLineCodeBlockAsync(
+    protected virtual ValueTask VisitMultiLineCodeBlockAsync(
        MultiLineCodeBlockNode multiLineCodeBlock,
-        CancellationToken cancellationToken = default) =>
-        new(multiLineCodeBlock);
+        CancellationToken cancellationToken = default) => default;

-    protected virtual async ValueTask<MarkdownNode> VisitLinkAsync(
+    protected virtual async ValueTask VisitLinkAsync(
        LinkNode link,
+        CancellationToken cancellationToken = default) =>
+        await VisitAsync(link.Children, cancellationToken);
+
+    protected virtual ValueTask VisitEmojiAsync(
+        EmojiNode emoji,
+        CancellationToken cancellationToken = default) => default;
+
+    protected virtual ValueTask VisitMentionAsync(
+        MentionNode mention,
+        CancellationToken cancellationToken = default) => default;
+
+    protected virtual ValueTask VisitTimestampAsync(
+        TimestampNode timestamp,
+        CancellationToken cancellationToken = default) => default;
+
+    public async ValueTask VisitAsync(
+        MarkdownNode node,
        CancellationToken cancellationToken = default)
    {
-        await VisitAsync(link.Children, cancellationToken);
-        return link;
-    }
-
-    protected virtual ValueTask<MarkdownNode> VisitEmojiAsync(
-        EmojiNode emoji,
-        CancellationToken cancellationToken = default) =>
-        new(emoji);
-
-    protected virtual ValueTask<MarkdownNode> VisitMentionAsync(
-        MentionNode mention,
-        CancellationToken cancellationToken = default) =>
-        new(mention);
-
-    protected virtual ValueTask<MarkdownNode> VisitTimestampAsync(
-        TimestampNode timestamp,
-        CancellationToken cancellationToken = default) =>
-        new(timestamp);
-
-    public async ValueTask<MarkdownNode> VisitAsync(
-        MarkdownNode node,
-        CancellationToken cancellationToken = default) => node switch
+        if (node is TextNode text)
        {
-            TextNode text =>
-                await VisitTextAsync(text, cancellationToken),
+            await VisitTextAsync(text, cancellationToken);
+            return;
+        }

-            FormattingNode formatting =>
-                await VisitFormattingAsync(formatting, cancellationToken),
+        if (node is FormattingNode formatting)
+        {
+            await VisitFormattingAsync(formatting, cancellationToken);
+            return;
+        }

-            HeaderNode header =>
-                await VisitHeaderAsync(header, cancellationToken),
+        if (node is HeaderNode header)
+        {
+            await VisitHeaderAsync(header, cancellationToken);
+            return;
+        }

-            InlineCodeBlockNode inlineCodeBlock =>
-                await VisitInlineCodeBlockAsync(inlineCodeBlock, cancellationToken),
+        if (node is ListNode list)
+        {
+            await VisitListAsync(list, cancellationToken);
+            return;
+        }

-            MultiLineCodeBlockNode multiLineCodeBlock =>
-                await VisitMultiLineCodeBlockAsync(multiLineCodeBlock, cancellationToken),
+        if (node is ListItemNode listItem)
+        {
+            await VisitListItemAsync(listItem, cancellationToken);
+            return;
+        }

-            LinkNode link =>
-                await VisitLinkAsync(link, cancellationToken),
+        if (node is InlineCodeBlockNode inlineCodeBlock)
+        {
+            await VisitInlineCodeBlockAsync(inlineCodeBlock, cancellationToken);
+            return;
+        }

-            EmojiNode emoji =>
-                await VisitEmojiAsync(emoji, cancellationToken),
+        if (node is MultiLineCodeBlockNode multiLineCodeBlock)
+        {
+            await VisitMultiLineCodeBlockAsync(multiLineCodeBlock, cancellationToken);
+            return;
+        }

-            MentionNode mention =>
-                await VisitMentionAsync(mention, cancellationToken),
+        if (node is LinkNode link)
+        {
+            await VisitLinkAsync(link, cancellationToken);
+            return;
+        }

-            TimestampNode timestamp =>
-                await VisitTimestampAsync(timestamp, cancellationToken),
+        if (node is EmojiNode emoji)
+        {
+            await VisitEmojiAsync(emoji, cancellationToken);
+            return;
+        }

-            _ => throw new ArgumentOutOfRangeException(nameof(node))
-        };
+        if (node is MentionNode mention)
+        {
+            await VisitMentionAsync(mention, cancellationToken);
+            return;
+        }
+
+        if (node is TimestampNode timestamp)
+        {
+            await VisitTimestampAsync(timestamp, cancellationToken);
+            return;
+        }
+
+        throw new ArgumentOutOfRangeException(nameof(node));
+    }

    public async ValueTask VisitAsync(
        IEnumerable<MarkdownNode> nodes,