[HTML] Recognize standard emoji by code and show emoji code in tooltips

Closes #549
Closes #599
This commit is contained in:
Tyrrrz
2021-06-19 19:49:53 +03:00
parent de57cd714d
commit 24a80f915f
14 changed files with 8960 additions and 77 deletions

View File

@@ -1,11 +1,20 @@
namespace DiscordChatExporter.Core.Markdown.Ast
using DiscordChatExporter.Core.Utils;
namespace DiscordChatExporter.Core.Markdown.Ast
{
internal class EmojiNode : MarkdownNode
{
// Only present on custom emoji
public string? Id { get; }
// Name of custom emoji (e.g. LUL) or actual representation of standard emoji (e.g. 🙂)
public string Name { get; }
// Textual code of the emoji.
// Custom emoji (those with a non-blank Id): the emoji name as-is (e.g. LUL).
// Standard emoji: the code EmojiIndex resolves for the name (e.g. slight_smile),
// falling back to the name itself when no code is returned.
public string Code
{
    get
    {
        if (!string.IsNullOrWhiteSpace(Id))
            return Name;

        return EmojiIndex.TryGetCode(Name) ?? Name;
    }
}
public bool IsAnimated { get; }
public bool IsCustomEmoji => !string.IsNullOrWhiteSpace(Id);

View File

@@ -2,16 +2,6 @@
namespace DiscordChatExporter.Core.Markdown.Ast
{
internal enum TextFormatting
{
Bold,
Italic,
Underline,
Strikethrough,
Spoiler,
Quote
}
internal class FormattedNode : MarkdownNode
{
public TextFormatting Formatting { get; }

View File

@@ -1,13 +1,5 @@
namespace DiscordChatExporter.Core.Markdown.Ast
{
internal enum MentionType
{
Meta,
User,
Channel,
Role
}
internal class MentionNode : MarkdownNode
{
public string Id { get; }

View File

@@ -0,0 +1,10 @@
namespace DiscordChatExporter.Core.Markdown.Ast
{
// Kind of entity referenced by a MentionNode
internal enum MentionType
{
// NOTE(review): presumably a mass mention such as @everyone / @here - confirm against the parser
Meta,
// Mention of a user, parsed from <@123456> or <@!123456>
User,
// Mention of a channel, parsed from <#123456>
Channel,
// Mention of a role, parsed from <@&123456>
Role
}
}

View File

@@ -0,0 +1,12 @@
namespace DiscordChatExporter.Core.Markdown.Ast
{
// Kind of formatting carried by a FormattedNode (exposed through its Formatting property)
internal enum TextFormatting
{
Bold,
Italic,
Underline,
Strikethrough,
Spoiler,
Quote
}
}

View File

@@ -3,6 +3,7 @@ using System.Linq;
using System.Text.RegularExpressions;
using DiscordChatExporter.Core.Markdown.Ast;
using DiscordChatExporter.Core.Markdown.Matching;
using DiscordChatExporter.Core.Utils;
namespace DiscordChatExporter.Core.Markdown
{
@@ -103,7 +104,7 @@ namespace DiscordChatExporter.Core.Markdown
// There can be either one or two backticks, but equal number on both sides
private static readonly IMatcher<MarkdownNode> InlineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("(`{1,2})([^`]+)\\1", DefaultRegexOptions | RegexOptions.Singleline),
m => new InlineCodeBlockNode(m.Groups[2].Value.Trim('\r', '\n'))
(_, m) => new InlineCodeBlockNode(m.Groups[2].Value.Trim('\r', '\n'))
);
// Capture language identifier and then any character until the earliest triple backtick
@@ -111,7 +112,7 @@ namespace DiscordChatExporter.Core.Markdown
// Blank lines at the beginning and end of content are trimmed
private static readonly IMatcher<MarkdownNode> MultiLineCodeBlockNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("```(?:(\\w*)\\n)?(.+?)```", DefaultRegexOptions | RegexOptions.Singleline),
m => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))
(_, m) => new MultiLineCodeBlockNode(m.Groups[1].Value, m.Groups[2].Value.Trim('\r', '\n'))
);
/* Mentions */
@@ -131,19 +132,19 @@ namespace DiscordChatExporter.Core.Markdown
// Capture <@123456> or <@!123456>
private static readonly IMatcher<MarkdownNode> UserMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<@!?(\\d+)>", DefaultRegexOptions),
m => new MentionNode(m.Groups[1].Value, MentionType.User)
(_, m) => new MentionNode(m.Groups[1].Value, MentionType.User)
);
// Capture <#123456>
private static readonly IMatcher<MarkdownNode> ChannelMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<#(\\d+)>", DefaultRegexOptions),
m => new MentionNode(m.Groups[1].Value, MentionType.Channel)
(_, m) => new MentionNode(m.Groups[1].Value, MentionType.Channel)
);
// Capture <@&123456>
private static readonly IMatcher<MarkdownNode> RoleMentionNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<@&(\\d+)>", DefaultRegexOptions),
m => new MentionNode(m.Groups[1].Value, MentionType.Role)
(_, m) => new MentionNode(m.Groups[1].Value, MentionType.Role)
);
/* Emojis */
@@ -154,15 +155,26 @@ namespace DiscordChatExporter.Core.Markdown
// ... or digit followed by enclosing mark
// (this does not match all emojis in Discord but it's reasonably accurate enough)
private static readonly IMatcher<MarkdownNode> StandardEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|[\\u2600-\\u26FF]|\\p{Cs}{2}|\\d\\p{Me})",
DefaultRegexOptions),
m => new EmojiNode(m.Groups[1].Value)
new Regex("((?:[\\uD83C][\\uDDE6-\\uDDFF]){2}|[\\u2600-\\u26FF]|\\p{Cs}{2}|\\d\\p{Me})", DefaultRegexOptions),
(_, m) => new EmojiNode(m.Groups[1].Value)
);
// Capture an emoji code wrapped in colons, e.g. :thinking:
// The transform returns null when EmojiIndex.TryGetName does not resolve the captured code,
// so only known emoji codes produce an EmojiNode
private static readonly IMatcher<MarkdownNode> CodedStandardEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
    new Regex(":([\\w_]+):", DefaultRegexOptions),
    (_, m) => EmojiIndex.TryGetName(m.Groups[1].Value) is { } name
        ? new EmojiNode(name)
        : null
);
// Capture <:lul:123456> or <a:lul:123456>
private static readonly IMatcher<MarkdownNode> CustomEmojiNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<(a)?:(.+?):(\\d+?)>", DefaultRegexOptions),
m => new EmojiNode(m.Groups[3].Value, m.Groups[2].Value, !string.IsNullOrWhiteSpace(m.Groups[1].Value))
(_, m) => new EmojiNode(m.Groups[3].Value, m.Groups[2].Value, !string.IsNullOrWhiteSpace(m.Groups[1].Value))
);
/* Links */
@@ -170,19 +182,19 @@ namespace DiscordChatExporter.Core.Markdown
// Capture [title](link)
private static readonly IMatcher<MarkdownNode> TitledLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\[(.+?)\\]\\((.+?)\\)", DefaultRegexOptions),
m => new LinkNode(m.Groups[2].Value, m.Groups[1].Value)
(_, m) => new LinkNode(m.Groups[2].Value, m.Groups[1].Value)
);
// Capture any non-whitespace character after http:// or https:// until the last punctuation character or whitespace
private static readonly IMatcher<MarkdownNode> AutoLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("(https?://\\S*[^\\.,:;\"\'\\s])", DefaultRegexOptions),
m => new LinkNode(m.Groups[1].Value)
(_, m) => new LinkNode(m.Groups[1].Value)
);
// Same as auto link but also surrounded by angle brackets
private static readonly IMatcher<MarkdownNode> HiddenLinkNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("<(https?://\\S*[^\\.,:;\"\'\\s])>", DefaultRegexOptions),
m => new LinkNode(m.Groups[1].Value)
(_, m) => new LinkNode(m.Groups[1].Value)
);
/* Text */
@@ -198,21 +210,21 @@ namespace DiscordChatExporter.Core.Markdown
// This escapes it from matching for emoji
private static readonly IMatcher<MarkdownNode> IgnoredEmojiTextNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("(\\u26A7|\\u2640|\\u2642|\\u2695|\\u267E|\\u00A9|\\u00AE|\\u2122)", DefaultRegexOptions),
m => new TextNode(m.Groups[1].Value)
(_, m) => new TextNode(m.Groups[1].Value)
);
// Capture any "symbol/other" character or surrogate pair preceded by a backslash
// This escapes it from matching for emoji
private static readonly IMatcher<MarkdownNode> EscapedSymbolTextNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\\\(\\p{So}|\\p{Cs}{2})", DefaultRegexOptions),
m => new TextNode(m.Groups[1].Value)
(_, m) => new TextNode(m.Groups[1].Value)
);
// Capture any character that is neither whitespace nor a Latin letter or digit, when preceded by a backslash
// This escapes it from matching for formatting or other tokens
private static readonly IMatcher<MarkdownNode> EscapedCharacterTextNodeMatcher = new RegexMatcher<MarkdownNode>(
new Regex("\\\\([^a-zA-Z0-9\\s])", DefaultRegexOptions),
m => new TextNode(m.Groups[1].Value)
(_, m) => new TextNode(m.Groups[1].Value)
);
// Combine all matchers into one
@@ -255,7 +267,8 @@ namespace DiscordChatExporter.Core.Markdown
// Emoji
StandardEmojiNodeMatcher,
CustomEmojiNodeMatcher
CustomEmojiNodeMatcher,
CodedStandardEmojiNodeMatcher
);
// Minimal set of matchers for non-multimedia formats (e.g. plain text)

View File

@@ -6,19 +6,14 @@ namespace DiscordChatExporter.Core.Markdown.Matching
internal class RegexMatcher<T> : IMatcher<T>
{
private readonly Regex _regex;
private readonly Func<StringPart, Match, T> _transform;
private readonly Func<StringPart, Match, T?> _transform;
public RegexMatcher(Regex regex, Func<StringPart, Match, T> transform)
public RegexMatcher(Regex regex, Func<StringPart, Match, T?> transform)
{
_regex = regex;
_transform = transform;
}
public RegexMatcher(Regex regex, Func<Match, T> transform)
: this(regex, (p, m) => transform(m))
{
}
public ParsedMatch<T>? TryMatch(StringPart stringPart)
{
var match = _regex.Match(stringPart.Target, stringPart.StartIndex, stringPart.Length);
@@ -30,11 +25,15 @@ namespace DiscordChatExporter.Core.Markdown.Matching
// Which is super weird because regex.Match(string, int) takes the whole input in context.
// So in order to properly account for ^/$ regex tokens, we need to make sure that
// the expression also matches on the bigger part of the input.
if (!_regex.IsMatch(stringPart.Target.Substring(0, stringPart.EndIndex), stringPart.StartIndex))
if (!_regex.IsMatch(stringPart.Target[..stringPart.EndIndex], stringPart.StartIndex))
return null;
var stringPartMatch = stringPart.Slice(match.Index, match.Length);
return new ParsedMatch<T>(stringPartMatch, _transform(stringPartMatch, match));
var value = _transform(stringPartMatch, match);
return value is not null
? new ParsedMatch<T>(stringPartMatch, value)
: null;
}
}
}

View File

@@ -6,9 +6,9 @@ namespace DiscordChatExporter.Core.Markdown.Matching
{
private readonly string _needle;
private readonly StringComparison _comparison;
private readonly Func<StringPart, T> _transform;
private readonly Func<StringPart, T?> _transform;
public StringMatcher(string needle, StringComparison comparison, Func<StringPart, T> transform)
public StringMatcher(string needle, StringComparison comparison, Func<StringPart, T?> transform)
{
_needle = needle;
_comparison = comparison;
@@ -23,14 +23,15 @@ namespace DiscordChatExporter.Core.Markdown.Matching
public ParsedMatch<T>? TryMatch(StringPart stringPart)
{
var index = stringPart.Target.IndexOf(_needle, stringPart.StartIndex, stringPart.Length, _comparison);
if (index < 0)
return null;
if (index >= 0)
{
var stringPartMatch = stringPart.Slice(index, _needle.Length);
return new ParsedMatch<T>(stringPartMatch, _transform(stringPartMatch));
}
var stringPartMatch = stringPart.Slice(index, _needle.Length);
var value = _transform(stringPartMatch);
return null;
return value is not null
? new ParsedMatch<T>(stringPartMatch, value)
: null;
}
}
}

View File

@@ -10,14 +10,13 @@ namespace DiscordChatExporter.Core.Markdown.Matching
public int Length { get; }
public int EndIndex { get; }
public int EndIndex => StartIndex + Length;
public StringPart(string target, int startIndex, int length)
{
Target = target;
StartIndex = startIndex;
Length = length;
EndIndex = startIndex + length;
}
public StringPart(string target)