Replacing slugify with a new function

This commit is contained in:
KnugiHK
2025-05-17 16:04:31 +08:00
parent 96e483a6b0
commit 3f88f7fe08
4 changed files with 67 additions and 21 deletions

View File

@@ -13,7 +13,7 @@ from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device
from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty
from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata
from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -668,8 +668,8 @@ def _process_single_media(data, content, media_folder, mime, separate_media):
# Copy media to separate folder if needed # Copy media to separate folder if needed
if separate_media: if separate_media:
chat_display_name = slugify(current_chat.name or message.sender chat_display_name = safe_name(current_chat.name or message.sender
or content["key_remote_jid"].split('@')[0], True) or content["key_remote_jid"].split('@')[0])
current_filename = file_path.split("/")[-1] current_filename = file_path.split("/")[-1]
new_folder = os.path.join(media_folder, "separated", chat_display_name) new_folder = os.path.join(media_folder, "separated", chat_display_name)
Path(new_folder).mkdir(parents=True, exist_ok=True) Path(new_folder).mkdir(parents=True, exist_ok=True)

View File

@@ -9,7 +9,7 @@ from mimetypes import MimeTypes
from markupsafe import escape as htmle from markupsafe import escape as htmle
from Whatsapp_Chat_Exporter.data_model import ChatStore, Message from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition
from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, slugify, Device from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name, Device
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -402,8 +402,8 @@ def process_media_item(content, data, media_folder, mime, separate_media):
# Handle separate media option # Handle separate media option
if separate_media: if separate_media:
chat_display_name = slugify( chat_display_name = safe_name(
current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True) current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0])
current_filename = file_path.split("/")[-1] current_filename = file_path.split("/")[-1]
new_folder = os.path.join(media_folder, "separated", chat_display_name) new_folder = os.path.join(media_folder, "separated", chat_display_name)
Path(new_folder).mkdir(parents=True, exist_ok=True) Path(new_folder).mkdir(parents=True, exist_ok=True)

View File

@@ -5,6 +5,7 @@ import json
import os import os
import unicodedata import unicodedata
import re import re
import string
import math import math
import shutil import shutil
from bleach import clean as sanitize from bleach import clean as sanitize
@@ -12,7 +13,7 @@ from markupsafe import Markup
from datetime import datetime, timedelta from datetime import datetime, timedelta
from enum import IntEnum from enum import IntEnum
from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple, Union
try: try:
from enum import StrEnum, IntEnum from enum import StrEnum, IntEnum
except ImportError: except ImportError:
@@ -600,26 +601,28 @@ def setup_template(template: Optional[str], no_avatar: bool, experimental: bool
APPLE_TIME = 978307200 APPLE_TIME = 978307200
def slugify(value: str, allow_unicode: bool = False) -> str: def safe_name(text: Union[str|bytes]) -> str:
""" """
Convert text to ASCII-only slugs for URL-safe strings. Sanitize the input text and generates a safe file name.
Taken from https://github.com/django/django/blob/master/django/utils/text.py This function serves a similar purpose to slugify() from
Django previously used in this project, but is a clean-room
Reimplementation tailored for performance and a narrower
Use case for this project. Licensed under the same terms
As the project (MIT).
Args: Args:
value (str): The string to convert to a slug. text (str|bytes): The string to be sanitized.
allow_unicode (bool, optional): Whether to allow Unicode characters. Defaults to False.
Returns: Returns:
str: The slugified string with only alphanumerics, underscores, or hyphens. str: The sanitized string with only alphanumerics, underscores, or hyphens.
""" """
value = str(value) if isinstance(text, bytes):
if allow_unicode: text = text.decode("utf-8", "ignore")
value = unicodedata.normalize('NFKC', value) elif not isinstance(text, str):
else: raise TypeError("value must be a string or bytes")
value = unicodedata.normalize('NFKD', value).encode( normalized_text = unicodedata.normalize("NFKC", text)
'ascii', 'ignore').decode('ascii') safe_chars = [char for char in normalized_text if char.isalnum() or char in "-_ ."]
value = re.sub(r'[^\w\s-]', '', value.lower()) return "-".join(''.join(safe_chars).split())
return re.sub(r'[-\s]+', '-', value).strip('-_')
class WhatsAppIdentifier(StrEnum): class WhatsAppIdentifier(StrEnum):

43
tests/test_utility.py Normal file
View File

@@ -0,0 +1,43 @@
import pytest
import random
import string
from Whatsapp_Chat_Exporter.utility import safe_name
def generate_random_string(length=50):
    """Return a reproducible pseudo-random string of ``length`` characters.

    Characters are drawn from ASCII letters, digits and "äöüß". A private
    ``random.Random`` instance seeded with 10 is used rather than reseeding
    the module-level generator, so calling this helper does not disturb the
    global ``random`` state. The produced characters are the same ones that
    ``random.seed(10)`` followed by ``random.choice`` would produce.

    Args:
        length (int): Number of characters to generate. Defaults to 50.

    Returns:
        str: The generated string; identical for identical ``length``.
    """
    rng = random.Random(10)
    # Build the alphabet once instead of once per generated character.
    alphabet = string.ascii_letters + string.digits + "äöüß"
    return ''.join(rng.choice(alphabet) for _ in range(length))
# Test cases to validate the safe_name function.
# Each entry is an (input_text, expected_output) pair consumed by the
# parametrized test_safe_name below.
safe_name_test_cases = [
# Plain ASCII text: whitespace-separated words are expected to be joined by "-".
("This is a test string", "This-is-a-test-string"),
# Punctuation such as !@#$%^&*() is expected to be dropped entirely.
("This is a test string with special characters!@#$%^&*()", "This-is-a-test-string-with-special-characters"),
("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"),
# Letter case is expected to be preserved (no lowercasing).
("This is a test string with mixed case ThisIsATestString", "This-is-a-test-string-with-mixed-case-ThisIsATestString"),
# NOTE(review): labelled "extra spaces" but the input shown here contains only
# single spaces -- confirm the case really includes consecutive spaces.
("This is a test string with extra spaces ThisIsATestString", "This-is-a-test-string-with-extra-spaces-ThisIsATestString"),
# Non-ASCII letters are expected to be kept.
("This is a test string with unicode characters äöüß", "This-is-a-test-string-with-unicode-characters-äöüß"),
("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"), # Chinese characters, should stay as is
# Long input: both sides embed generate_random_string(1000), so the expected
# value relies on that helper returning the same string on both calls.
(
f"This is a test string with long length {generate_random_string(1000)}",
f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}",
),
("", ""), # Empty string
(" ", ""), # String with only space
("---", "---"), # String with only hyphens
("___", "___"), # String with only underscores
("a" * 100, "a" * 100), # Long string with single character
("a-b-c-d-e", "a-b-c-d-e"), # String with hyphen
("a_b_c_d_e", "a_b_c_d_e"), # String with underscore
("a b c d e", "a-b-c-d-e"), # String with spaces
# "/", "?", "=" and "&" are expected to be removed while "." is kept.
("test.com/path/to/resource?param1=value1&param2=value2", "test.compathtoresourceparam1value1param2value2"), # Test with URL
("filename.txt", "filename.txt"), # Test with filename
("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.", "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."), # Greek characters
# The stand-alone combining mark between "знаки" and "example" is expected to vanish.
("This is a test with комбинированные знаки ̆ example", "This-is-a-test-with-комбинированные-знаки-example") # Mixed with unicode
]
@pytest.mark.parametrize("input_text, expected_output", safe_name_test_cases)
def test_safe_name(input_text, expected_output):
    """Check that safe_name() turns each input into its expected output."""
    assert safe_name(input_text) == expected_output