Improve performance (#162)

2026-05-18 14:55:08 +00:00 · 2019-04-10 23:45:21 +03:00
parent 359278afec
commit 4bfb2ec7fd
86 changed files with 1242 additions and 900 deletions
--- a/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/AggregateMatcher.cs
@@ -0,0 +1,58 @@
+using System.Collections.Generic;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    /// <summary>
+    /// Combines several matchers and reports whichever match begins closest to the start of the searched range.
+    /// </summary>
+    internal class AggregateMatcher<T> : IMatcher<T>
+    {
+        private readonly IReadOnlyList<IMatcher<T>> _matchers;
+
+        /// <summary>
+        /// Creates an aggregate over the given matchers; their order decides which one wins a tie.
+        /// </summary>
+        public AggregateMatcher(IReadOnlyList<IMatcher<T>> matchers)
+        {
+            _matchers = matchers;
+        }
+
+        /// <summary>
+        /// Convenience overload that accepts the matchers as individual arguments.
+        /// </summary>
+        public AggregateMatcher(params IMatcher<T>[] matchers)
+            : this((IReadOnlyList<IMatcher<T>>)matchers)
+        {
+        }
+
+        /// <summary>
+        /// Runs the matchers in order and returns the match with the lowest start index,
+        /// or null when none of them matches.
+        /// </summary>
+        public ParsedMatch<T> Match(string input, int startIndex, int length)
+        {
+            ParsedMatch<T> best = null;
+
+            for (var i = 0; i < _matchers.Count; i++)
+            {
+                var candidate = _matchers[i].Match(input, startIndex, length);
+
+                // Skip matchers that found nothing
+                if (candidate == null)
+                    continue;
+
+                // Skip candidates that don't beat the current best (ties keep the earlier matcher's result)
+                if (best != null && candidate.StartIndex >= best.StartIndex)
+                    continue;
+
+                best = candidate;
+
+                // Nothing can start before the beginning of the range, so stop searching
+                if (best.StartIndex == startIndex)
+                    break;
+            }
+
+            return best;
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/Extensions.cs
@@ -0,0 +1,69 @@
+using System;
+using System.Collections.Generic;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    internal static class Extensions
+    {
+        /// <summary>
+        /// Splits the range [startIndex, startIndex + length) of <paramref name="input"/> into consecutive matches.
+        /// Text that the matcher does not recognize is passed through <paramref name="fallbackTransform"/>
+        /// and yielded as a match of its own, so that no part of the range is skipped.
+        /// </summary>
+        /// <param name="matcher">Matcher used to find recognized segments.</param>
+        /// <param name="input">String to search in.</param>
+        /// <param name="startIndex">Index in <paramref name="input"/> where the range starts.</param>
+        /// <param name="length">Number of characters in the range.</param>
+        /// <param name="fallbackTransform">Converts unrecognized text into a value.</param>
+        /// <returns>Lazily evaluated sequence of matches, ordered by position.</returns>
+        /// <remarks>
+        /// The matcher must not return empty matches: the loop only advances by the length of the last match.
+        /// </remarks>
+        public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
+            int startIndex, int length, Func<string, T> fallbackTransform)
+        {
+            // Get end index for simplicity
+            var endIndex = startIndex + length;
+
+            // Loop through segments divided by individual matches
+            var currentIndex = startIndex;
+            while (currentIndex < endIndex)
+            {
+                // Find a match within this segment
+                var match = matcher.Match(input, currentIndex, endIndex - currentIndex);
+
+                // If there's no match - break
+                if (match == null)
+                    break;
+
+                // If this match doesn't start immediately at current index - transform and yield fallback first
+                if (match.StartIndex > currentIndex)
+                {
+                    var fallback = input.Substring(currentIndex, match.StartIndex - currentIndex);
+                    yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
+                }
+
+                // Yield match
+                yield return match;
+
+                // Shift current index to the end of the match
+                currentIndex = match.StartIndex + match.Length;
+            }
+
+            // If the end of the range wasn't reached - transform and yield the remaining part as fallback.
+            // The substring is bounded by endIndex so that text past the requested range is never included.
+            if (currentIndex < endIndex)
+            {
+                var fallback = input.Substring(currentIndex, endIndex - currentIndex);
+                yield return new ParsedMatch<T>(currentIndex, fallback.Length, fallbackTransform(fallback));
+            }
+        }
+
+        /// <summary>
+        /// Splits the entire <paramref name="input"/> into consecutive matches, using
+        /// <paramref name="fallbackTransform"/> for text the matcher does not recognize.
+        /// </summary>
+        public static IEnumerable<ParsedMatch<T>> MatchAll<T>(this IMatcher<T> matcher, string input,
+            Func<string, T> fallbackTransform) => matcher.MatchAll(input, 0, input.Length, fallbackTransform);
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/Grammar.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/Grammar.cs
@@ -1,178 +0,0 @@
-using System.Collections.Generic;
-using System.Linq;
-using System.Text.RegularExpressions;
-using Sprache;
-using Tyrrrz.Extensions;
-
-namespace DiscordChatExporter.Core.Markdown.Internal
-{
-    // The following parsing logic is meant to replicate Discord's markdown grammar as close as possible
-    internal static class Grammar
-    {
-        /* Formatting */
-
-        // Capture until the earliest double asterisk not followed by an asterisk
-        private static readonly Parser<Node> BoldFormattedNode =
-            Parse.RegexMatch(new Regex("\\*\\*(.+?)\\*\\*(?!\\*)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "**", TextFormatting.Bold, BuildTree(m.Groups[1].Value)));
-
-        // Capture until the earliest single asterisk not preceded or followed by an asterisk
-        // Can't have whitespace right after opening or right before closing asterisk
-        private static readonly Parser<Node> ItalicFormattedNode =
-            Parse.RegexMatch(new Regex("\\*(?!\\s)(.+?)(?<!\\s|\\*)\\*(?!\\*)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Can't have underscores inside
-        // Can't have word characters right after closing underscore
-        private static readonly Parser<Node> ItalicAltFormattedNode =
-            Parse.RegexMatch(new Regex("_([^_]+?)_(?!\\w)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Treated as a separate entity for simplicity
-        // Capture until the earliest triple asterisk not preceded or followed by an asterisk
-        private static readonly Parser<Node> ItalicBoldFormattedNode =
-            Parse.RegexMatch(new Regex("\\*(\\*\\*(?:.+?)\\*\\*)\\*(?!\\*)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "*", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Capture until the earliest double underscore not followed by an underscore
-        private static readonly Parser<Node> UnderlineFormattedNode =
-            Parse.RegexMatch(new Regex("__(.+?)__(?!_)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "__", TextFormatting.Underline, BuildTree(m.Groups[1].Value)));
-
-        // Treated as a separate entity for simplicity
-        // Capture until the earliest triple underscore not preceded or followed by an underscore
-        private static readonly Parser<Node> ItalicUnderlineFormattedNode =
-            Parse.RegexMatch(new Regex("_(__(?:.+?)__)_(?!_)", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "_", TextFormatting.Italic, BuildTree(m.Groups[1].Value)));
-
-        // Strikethrough is safe
-        private static readonly Parser<Node> StrikethroughFormattedNode =
-            Parse.RegexMatch(new Regex("~~(.+?)~~", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "~~", TextFormatting.Strikethrough, BuildTree(m.Groups[1].Value)));
-
-        // Spoiler is safe
-        private static readonly Parser<Node> SpoilerFormattedNode =
-            Parse.RegexMatch(new Regex("\\|\\|(.+?)\\|\\|", RegexOptions.Singleline))
-                .Select(m => new FormattedNode(m.Value, "||", TextFormatting.Spoiler, BuildTree(m.Groups[1].Value)));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyFormattedNode = 
-            ItalicBoldFormattedNode.Or(ItalicUnderlineFormattedNode)
-            .Or(BoldFormattedNode).Or(ItalicFormattedNode)
-            .Or(UnderlineFormattedNode).Or(ItalicAltFormattedNode)
-            .Or(StrikethroughFormattedNode).Or(SpoilerFormattedNode);
-
-        /* Code blocks */
-
-        // Can't have backticks inside and surrounding whitespace is trimmed
-        private static readonly Parser<Node> InlineCodeBlockNode =
-            Parse.RegexMatch(new Regex("`\\s*([^`]+?)\\s*`", RegexOptions.Singleline))
-                .Select(m => new InlineCodeBlockNode(m.Value, m.Groups[1].Value));
-
-        // The first word is a language identifier if it's the only word followed by a newline, the rest is code
-        private static readonly Parser<Node> MultilineCodeBlockNode =
-            Parse.RegexMatch(new Regex("```(?:(\\w*?)?(?:\\s*?\\n))?(.+?)```", RegexOptions.Singleline))
-                .Select(m => new MultilineCodeBlockNode(m.Value, m.Groups[1].Value, m.Groups[2].Value));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyCodeBlockNode = MultilineCodeBlockNode.Or(InlineCodeBlockNode);
-
-        /* Mentions */
-
-        // @everyone or @here
-        private static readonly Parser<Node> MetaMentionNode = Parse.RegexMatch("@(everyone|here)")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Meta));
-
-        // <@123456> or <@!123456>
-        private static readonly Parser<Node> UserMentionNode = Parse.RegexMatch("<@!?(\\d+)>")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.User));
-
-        // <#123456>
-        private static readonly Parser<Node> ChannelMentionNode = Parse.RegexMatch("<#(\\d+)>")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Channel));
-
-        // <@&123456>
-        private static readonly Parser<Node> RoleMentionNode = Parse.RegexMatch("<@&(\\d+)>")
-            .Select(m => new MentionNode(m.Value, m.Groups[1].Value, MentionType.Role));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyMentionNode =
-            MetaMentionNode.Or(UserMentionNode).Or(ChannelMentionNode).Or(RoleMentionNode);
-
-        /* Emojis */
-
-        // Matches all standard unicode emojis
-        private static readonly Parser<Node> StandardEmojiNode = Parse.RegexMatch(
-                "([\\u2700-\\u27bf]|" +
-                "(?:\\ud83c[\\udde6-\\uddff]){2}|" +
-                "[\\ud800-\\udbff][\\udc00-\\udfff]|" +
-                "[\\u0023-\\u0039]\\u20e3|" +
-                "\\u3299|\\u3297|\\u303d|\\u3030|\\u24c2|\\ud83c[\\udd70-\\udd71]|\\ud83c[\\udd7e-\\udd7f]|\\ud83c\\udd8e|\\ud83c[\\udd91-\\udd9a]|\\ud83c[\\udde6-\\uddff]|" +
-                "\\ud83c[\\ude01-\\ude02]|\\ud83c\\ude1a|\\ud83c\\ude2f|\\ud83c[\\ude32-\\ude3a]|\\ud83c[\\ude50-\\ude51]|\\u203c|\\u2049|[\\u25aa-\\u25ab]|" +
-                "\\u25b6|\\u25c0|[\\u25fb-\\u25fe]|\\u00a9|\\u00ae|\\u2122|\\u2139|\\ud83c\\udc04|[\\u2600-\\u26FF]|\\u2b05|\\u2b06|\\u2b07|\\u2b1b|\\u2b1c|\\u2b50|" +
-                "\\u2b55|\\u231a|\\u231b|\\u2328|\\u23cf|[\\u23e9-\\u23f3]|[\\u23f8-\\u23fa]|\\ud83c\\udccf|\\u2934|\\u2935|[\\u2190-\\u21ff])")
-            .Select(m => new EmojiNode(m.Value, m.Groups[1].Value));
-
-        // <:lul:123456> or <a:lul:123456>
-        private static readonly Parser<Node> CustomEmojiNode = Parse.RegexMatch("<(a)?:(.+?):(\\d+)>")
-            .Select(m => new EmojiNode(m.Value, m.Groups[3].Value, m.Groups[2].Value, m.Groups[1].Value.IsNotBlank()));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyEmojiNode = StandardEmojiNode.Or(CustomEmojiNode);
-
-        /* Links */
-
-        // [title](link)
-        private static readonly Parser<Node> TitledLinkNode = Parse.RegexMatch("\\[(.+?)\\]\\((.+?)\\)")
-            .Select(m => new LinkNode(m.Value, m.Groups[2].Value, m.Groups[1].Value));
-
-        // Starts with http:// or https://, stops at the last non-whitespace character followed by whitespace or punctuation character
-        private static readonly Parser<Node> AutoLinkNode = Parse.RegexMatch("(https?://\\S*[^\\.,:;\"\'\\s])")
-            .Select(m => new LinkNode(m.Value, m.Groups[1].Value));
-
-        // Autolink surrounded by angular brackets
-        private static readonly Parser<Node> HiddenLinkNode = Parse.RegexMatch("<(https?://\\S*[^\\.,:;\"\'\\s])>")
-            .Select(m => new LinkNode(m.Value, m.Groups[1].Value));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyLinkNode = TitledLinkNode.Or(HiddenLinkNode).Or(AutoLinkNode); 
-
-        /* Text */
-
-        // Shrug is an exception and needs to be exempt from formatting
-        private static readonly Parser<Node> ShrugTextNode =
-            Parse.String("¯\\_(ツ)_/¯").Text().Select(s => new TextNode(s));
-
-        // Backslash escapes any following unicode surrogate pair
-        private static readonly Parser<Node> EscapedSurrogateTextNode =
-            from slash in Parse.Char('\\')
-            from high in Parse.AnyChar.Where(char.IsHighSurrogate)
-            from low in Parse.AnyChar
-            let lexeme = $"{slash}{high}{low}"
-            let text = $"{high}{low}"
-            select new TextNode(lexeme, text);
-
-        // Backslash escapes any following non-whitespace character except for digits and latin letters
-        private static readonly Parser<Node> EscapedTextNode =
-            Parse.RegexMatch("\\\\([^a-zA-Z0-9\\s])").Select(m => new TextNode(m.Value, m.Groups[1].Value));
-
-        // Combinator, order matters
-        private static readonly Parser<Node> AnyTextNode = ShrugTextNode.Or(EscapedSurrogateTextNode).Or(EscapedTextNode);
-
-        /* Aggregator and fallback */
-
-        // Any node recognized by above patterns
-        private static readonly Parser<Node> AnyRecognizedNode = AnyFormattedNode.Or(AnyCodeBlockNode)
-            .Or(AnyMentionNode).Or(AnyEmojiNode).Or(AnyLinkNode).Or(AnyTextNode);
-
-        // Any node not recognized by above patterns (treated as plain text)
-        private static readonly Parser<Node> FallbackNode =
-            Parse.AnyChar.Except(AnyRecognizedNode).AtLeastOnce().Text().Select(s => new TextNode(s));
-
-        // Any node
-        private static readonly Parser<Node> AnyNode = AnyRecognizedNode.Or(FallbackNode);
-
-        // Entry point
-        public static IReadOnlyList<Node> BuildTree(string input) => AnyNode.Many().Parse(input).ToArray();
-    }
-}
--- a/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/IMatcher.cs
@@ -0,0 +1,15 @@
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    /// <summary>
+    /// Searches part of a string and produces a value of type <typeparamref name="T"/> for what it finds.
+    /// </summary>
+    internal interface IMatcher<T>
+    {
+        /// <summary>
+        /// Looks for a match within the <paramref name="length"/> characters of <paramref name="input"/>
+        /// that begin at <paramref name="startIndex"/>.
+        /// </summary>
+        /// <returns>The match, or null when nothing was found.</returns>
+        ParsedMatch<T> Match(string input, int startIndex, int length);
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/ParsedMatch.cs
@@ -0,0 +1,24 @@
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    /// <summary>
+    /// Result of a match: where it is located and the value produced from it.
+    /// </summary>
+    internal partial class ParsedMatch<T>
+    {
+        public ParsedMatch(int startIndex, int length, T value)
+        {
+            Value = value;
+            Length = length;
+            StartIndex = startIndex;
+        }
+
+        /// <summary>Index at which the match begins.</summary>
+        public int StartIndex { get; }
+
+        /// <summary>Number of characters covered by the match.</summary>
+        public int Length { get; }
+
+        /// <summary>Value associated with the match.</summary>
+        public T Value { get; }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/RegexMatcher.cs
@@ -0,0 +1,34 @@
+using System;
+using System.Text.RegularExpressions;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    /// <summary>
+    /// Matcher that finds the first occurrence of a regular expression and converts it with a delegate.
+    /// </summary>
+    internal class RegexMatcher<T> : IMatcher<T>
+    {
+        private readonly Regex _regex;
+        private readonly Func<Match, T> _transform;
+
+        public RegexMatcher(Regex regex, Func<Match, T> transform)
+        {
+            _regex = regex;
+            _transform = transform;
+        }
+
+        /// <summary>
+        /// Applies the regex to the given range of <paramref name="input"/> and returns the transformed
+        /// result, or null when the pattern does not match. The transform is only invoked on success.
+        /// </summary>
+        public ParsedMatch<T> Match(string input, int startIndex, int length)
+        {
+            var result = _regex.Match(input, startIndex, length);
+            if (!result.Success)
+                return null;
+
+            var value = _transform(result);
+            return new ParsedMatch<T>(result.Index, result.Length, value);
+        }
+    }
+}
--- a/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs
+++ b/DiscordChatExporter.Core.Markdown/Internal/StringMatcher.cs
@@ -0,0 +1,55 @@
+using System;
+
+namespace DiscordChatExporter.Core.Markdown.Internal
+{
+    /// <summary>
+    /// Matcher that finds the first occurrence of a fixed string and converts the matched text with a delegate.
+    /// </summary>
+    internal class StringMatcher<T> : IMatcher<T>
+    {
+        private readonly string _needle;
+        private readonly StringComparison _comparison;
+        private readonly Func<string, T> _transform;
+
+        /// <summary>
+        /// Creates a matcher that searches for <paramref name="needle"/> using the given comparison rules.
+        /// </summary>
+        public StringMatcher(string needle, StringComparison comparison, Func<string, T> transform)
+        {
+            _needle = needle;
+            _comparison = comparison;
+            _transform = transform;
+        }
+
+        /// <summary>
+        /// Creates a matcher that searches for <paramref name="needle"/> using ordinal comparison.
+        /// </summary>
+        public StringMatcher(string needle, Func<string, T> transform)
+            : this(needle, StringComparison.Ordinal, transform)
+        {
+        }
+
+        /// <summary>
+        /// Searches the given range of <paramref name="input"/> for the needle.
+        /// </summary>
+        /// <returns>
+        /// A match whose value is built from the text actually found in <paramref name="input"/>,
+        /// or null when the needle does not occur in the range.
+        /// </returns>
+        public ParsedMatch<T> Match(string input, int startIndex, int length)
+        {
+            var index = input.IndexOf(_needle, startIndex, length, _comparison);
+            if (index < 0)
+                return null;
+
+            // Pass the text as it appears in the input rather than the needle itself: with a
+            // case-insensitive comparison they can differ, and the input's casing must be preserved.
+            // The length is clamped to the searched range in case a culture-sensitive comparison
+            // matched fewer characters than the needle contains.
+            var matchedLength = Math.Min(_needle.Length, startIndex + length - index);
+            var matchedText = input.Substring(index, matchedLength);
+
+            return new ParsedMatch<T>(index, matchedLength, _transform(matchedText));
+        }
+    }
+}