Replacing slugify with a new function

2026-05-06 12:16:58 +00:00 · 2025-05-17 16:04:31 +08:00
parent 96e483a6b0
commit 3f88f7fe08
4 changed files with 67 additions and 21 deletions
--- a/Whatsapp_Chat_Exporter/android_handler.py
+++ b/Whatsapp_Chat_Exporter/android_handler.py
@@ -13,7 +13,7 @@ from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
 from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device
 from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty
 from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata
-from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable
+from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable


 logger = logging.getLogger(__name__)
@@ -668,8 +668,8 @@ def _process_single_media(data, content, media_folder, mime, separate_media):

        # Copy media to separate folder if needed
        if separate_media:
-            chat_display_name = slugify(current_chat.name or message.sender
-                                        or content["key_remote_jid"].split('@')[0], True)
+            chat_display_name = safe_name(current_chat.name or message.sender
+                                        or content["key_remote_jid"].split('@')[0])
            current_filename = file_path.split("/")[-1]
            new_folder = os.path.join(media_folder, "separated", chat_display_name)
            Path(new_folder).mkdir(parents=True, exist_ok=True)
--- a/Whatsapp_Chat_Exporter/ios_handler.py
+++ b/Whatsapp_Chat_Exporter/ios_handler.py
@@ -9,7 +9,7 @@ from mimetypes import MimeTypes
 from markupsafe import escape as htmle
 from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
 from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition
-from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, slugify, Device
+from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name, Device


 logger = logging.getLogger(__name__)
@@ -402,8 +402,8 @@ def process_media_item(content, data, media_folder, mime, separate_media):

        # Handle separate media option
        if separate_media:
-            chat_display_name = slugify(
-                current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True)
+            chat_display_name = safe_name(
+                current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0])
            current_filename = file_path.split("/")[-1]
            new_folder = os.path.join(media_folder, "separated", chat_display_name)
            Path(new_folder).mkdir(parents=True, exist_ok=True)
--- a/Whatsapp_Chat_Exporter/utility.py
+++ b/Whatsapp_Chat_Exporter/utility.py
@@ -5,6 +5,7 @@ import json
 import os
 import unicodedata
 import re
+import string
 import math
 import shutil
 from bleach import clean as sanitize
@@ -12,7 +13,7 @@ from markupsafe import Markup
 from datetime import datetime, timedelta
 from enum import IntEnum
 from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 try:
    from enum import StrEnum, IntEnum
 except ImportError:
@@ -600,26 +601,28 @@ def setup_template(template: Optional[str], no_avatar: bool, experimental: bool
 APPLE_TIME = 978307200


-def slugify(value: str, allow_unicode: bool = False) -> str:
+def safe_name(text: Union[str, bytes]) -> str:
    """
-    Convert text to ASCII-only slugs for URL-safe strings.
-    Taken from https://github.com/django/django/blob/master/django/utils/text.py
+    Sanitize the input text and generate a safe file name.
+    This function serves a similar purpose to slugify() from
+    Django previously used in this project, but is a clean-room
+    reimplementation tailored for performance and a narrower
+    use case for this project. Licensed under the same terms
+    as the project (MIT).

    Args:
-        value (str): The string to convert to a slug.
-        allow_unicode (bool, optional): Whether to allow Unicode characters. Defaults to False.
+        text (str|bytes): The string to be sanitized. Bytes are decoded as UTF-8 (invalid bytes ignored); any other type raises TypeError.

    Returns:
-        str: The slugified string with only alphanumerics, underscores, or hyphens.
+        str: The sanitized string with only alphanumerics, underscores, hyphens, or dots; runs of spaces become single hyphens.
    """
-    value = str(value)
-    if allow_unicode:
-        value = unicodedata.normalize('NFKC', value)
-    else:
-        value = unicodedata.normalize('NFKD', value).encode(
-            'ascii', 'ignore').decode('ascii')
-    value = re.sub(r'[^\w\s-]', '', value.lower())
-    return re.sub(r'[-\s]+', '-', value).strip('-_')
+    if isinstance(text, bytes):
+        text = text.decode("utf-8", "ignore")
+    elif not isinstance(text, str):
+        raise TypeError("text must be a string or bytes")
+    normalized_text = unicodedata.normalize("NFKC", text)  # fold compatibility forms before filtering
+    safe_chars = [char for char in normalized_text if char.isalnum() or char in "-_ ."]
+    return "-".join(''.join(safe_chars).split())  # split() drops leading/trailing spaces and collapses runs


 class WhatsAppIdentifier(StrEnum):
--- a/tests/test_utility.py
+++ b/tests/test_utility.py
@@ -0,0 +1,43 @@
+import pytest
+import random
+import string
+
+from Whatsapp_Chat_Exporter.utility import safe_name
+
+def generate_random_string(length=50):  # returns the same `length`-character string on every call
+    rng = random.Random(10)  # private fixed-seed generator: deterministic without reseeding the global RNG
+    return ''.join(rng.choice(string.ascii_letters + string.digits + "äöüß") for _ in range(length))
+
+
+# (input, expected) pairs for safe_name; each pair is fed to the parametrized test below
+safe_name_test_cases = [
+    ("This is a test string", "This-is-a-test-string"),  # Spaces become hyphens
+    ("This is a test string with special characters!@#$%^&*()", "This-is-a-test-string-with-special-characters"),  # Punctuation and symbols are dropped
+    ("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"),  # Digits are kept
+    ("This is a test string with mixed case ThisIsATestString", "This-is-a-test-string-with-mixed-case-ThisIsATestString"),  # Case is preserved
+    ("This is a test string with extra spaces     ThisIsATestString", "This-is-a-test-string-with-extra-spaces-ThisIsATestString"),  # A run of spaces collapses to one hyphen
+    ("This is a test string with unicode characters äöüß", "This-is-a-test-string-with-unicode-characters-äöüß"),  # Non-ASCII letters are kept
+    ("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"),  # Chinese characters, should stay as is
+    (  # Long input; both sides embed the same deterministic random suffix
+        f"This is a test string with long length {generate_random_string(1000)}",
+        f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}",
+    ),
+    ("", ""),  # Empty string
+    (" ", ""),  # String with only space
+    ("---", "---"),  # String with only hyphens
+    ("___", "___"),  # String with only underscores
+    ("a" * 100, "a" * 100),  # Long string with single character
+    ("a-b-c-d-e", "a-b-c-d-e"),  # String with hyphen
+    ("a_b_c_d_e", "a_b_c_d_e"),  # String with underscore
+    ("a b c d e", "a-b-c-d-e"),  # String with spaces
+    ("test.com/path/to/resource?param1=value1&param2=value2", "test.compathtoresourceparam1value1param2value2"),  # Test with URL
+    ("filename.txt", "filename.txt"),  # Test with filename
+    ("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.", "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."),  # Greek characters
+    ("This is a test with комбинированные знаки ̆ example", "This-is-a-test-with-комбинированные-знаки-example")  # Mixed with unicode
+]
+
+
+@pytest.mark.parametrize(("input_text", "expected_output"), safe_name_test_cases)
+def test_safe_name(input_text, expected_output):
+    """Check that safe_name maps each input to exactly its expected sanitized name."""
+    assert safe_name(input_text) == expected_output