From bd503a0c7f64e64b23dce02de39be83afeb4cb2c Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 27 Apr 2025 15:16:57 +0800 Subject: [PATCH 001/108] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7514020..fa2658d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "whatsapp-chat-exporter" version = "0.12.0" -description = "A Whatsapp database parser that provides history of your Whatsapp conversations in HTML and JSON. Android, iOS, iPadOS, Crypt12, Crypt14, Crypt15 supported." +description = "A cross-platform tool for parsing WhatsApp chat databases from Android and iOS/iPadOS backups. Supports Android .crypt12, .crypt14, .crypt15, and the latest database formats. Outputs chat history in readable HTML or structured JSON." readme = "README.md" authors = [ { name = "KnugiHK", email = "hello@knugi.com" } From 1207b1e0cc366451d1607cf00994bf895c3514f7 Mon Sep 17 00:00:00 2001 From: fschuh Date: Fri, 11 Apr 2025 00:15:07 -0400 Subject: [PATCH 002/108] Added support for incremental merging --- Whatsapp_Chat_Exporter/__main__.py | 32 ++++++++++++++++- Whatsapp_Chat_Exporter/data_model.py | 49 ++++++++++++++++++++++++- Whatsapp_Chat_Exporter/utility.py | 53 ++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 2 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index bf130be..cde7e1a 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -13,7 +13,7 @@ from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, check_update, DbType from Whatsapp_Chat_Exporter.utility import readable_to_bytes, sanitize_filename -from 
Whatsapp_Chat_Exporter.utility import import_from_json, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, bytes_to_readable from argparse import ArgumentParser, SUPPRESS from datetime import datetime from getpass import getpass @@ -206,6 +206,32 @@ def setup_argument_parser() -> ArgumentParser: "--default-country-code", dest="default_country_code", default=None, help="Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. Most likely use the number of your own country" ) + + # Incremental merging + inc_merging_group = parser.add_argument_group('Incremental Merging') + inc_merging_group.add_argument( + "--incremental-merge", + dest="incremental_merge", + default=False, + action='store_true', + help=("Performs an incremental merge of two exports." + "Requires setting both --source-dir and --target-dir." + "The chats and media of the source directory will be merged into the target directory." + "No chats or media will be deleted from the target directory, only new chats and media will be added to it." + ) + ) + inc_merging_group.add_argument( + "--source-dir", + dest="source_dir", + default=None, + help="Sets the source directory. Used for performing incremental merges." + ) + inc_merging_group.add_argument( + "--target-dir", + dest="target_dir", + default=None, + help="Sets the target directory. Used for performing incremental merges." 
+ ) # Miscellaneous misc_group = parser.add_argument_group('Miscellaneous') @@ -250,6 +276,8 @@ def validate_args(parser: ArgumentParser, args) -> None: parser.error("You can only use --import with -j and without --no-html, -a, -i, -e.") elif args.import_json and not os.path.isfile(args.json): parser.error("JSON file not found.") + if args.incremental_merge and args.source_dir is None or args.target_dir is None: + parser.error("You must specify both --source-dir and --target-dir for incremental merge.") if args.android and args.business: parser.error("WhatsApp Business is only available on iOS for now.") if "??" not in args.headline: @@ -672,6 +700,8 @@ def main(): args.whatsapp_theme, args.headline ) + elif args.incremental_merge: + incremental_merge(args.source_dir, args.target_dir, args.media) elif args.exported: # Process exported chat process_exported_chat(args, data) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index e84154d..09101b0 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -213,6 +213,18 @@ class ChatStore: 'status': self.status, 'messages': {id: msg.to_json() for id, msg in self._messages.items()} } + + @classmethod + def from_json(cls, data): + chat = cls(data.get("type"), data.get("name")) + chat.my_avatar = data.get("my_avatar") + chat.their_avatar = data.get("their_avatar") + chat.their_avatar_thumb = data.get("their_avatar_thumb") + chat.status = data.get("status") + for id, msg_data in data.get("messages", {}).items(): + message = Message.from_json(msg_data) + chat.add_message(id, message) + return chat def get_last_message(self) -> 'Message': """Get the most recent message in the chat.""" @@ -230,6 +242,20 @@ class ChatStore: """Get all message keys in the chat.""" return self._messages.keys() + def merge_with(self, other): + if not isinstance(other, ChatStore): + raise TypeError("Can only merge with another ChatStore object") + + # Update fields if 
they are not None in the other ChatStore + self.name = other.name or self.name + self.type = other.type or self.type + self.my_avatar = other.my_avatar or self.my_avatar + self.their_avatar = other.their_avatar or self.their_avatar + self.their_avatar_thumb = other.their_avatar_thumb or self.their_avatar_thumb + self.status = other.status or self.status + + # Merge messages + self.messages.update(other.messages) class Message: """ @@ -310,4 +336,25 @@ class Message: 'caption': self.caption, 'thumb': self.thumb, 'sticker': self.sticker - } \ No newline at end of file + } + + @classmethod + def from_json(cls, data): + message = cls( + data["from_me"], + data["timestamp"], + data["time"], + data["key_id"] + ) + message.media = data.get("media") + message.meta = data.get("meta") + message.data = data.get("data") + message.sender = data.get("sender") + message.safe = data.get("safe") + message.mime = data.get("mime") + message.reply = data.get("reply") + message.quoted_data = data.get("quoted_data") + message.caption = data.get("caption") + message.thumb = data.get("thumb") + message.sticker = data.get("sticker") + return message diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 6d9fd36..3f3f71d 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -10,6 +10,7 @@ from markupsafe import Markup from datetime import datetime, timedelta from enum import IntEnum from Whatsapp_Chat_Exporter.data_model import ChatStore +import shutil from typing import Dict, List, Optional, Tuple try: from enum import StrEnum, IntEnum @@ -258,6 +259,58 @@ def import_from_json(json_file: str, data: Dict[str, ChatStore]): print(f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") +def incremental_merge(source_dir: str, target_dir: str, media_dir: str): + json_files = [f for f in os.listdir(source_dir) if f.endswith('.json')] + print("JSON files found:", json_files) + + for json_file in json_files: + 
source_path = os.path.join(source_dir, json_file) + target_path = os.path.join(target_dir, json_file) + + if not os.path.exists(target_path): + print(f"Copying {json_file} to target directory...") + os.makedirs(target_dir, exist_ok=True) + with open(source_path, 'rb') as src, open(target_path, 'wb') as dst: + dst.write(src.read()) + else: + print(f"Merging {json_file} with existing file in target directory...") + with open(source_path, 'r') as src_file, open(target_path, 'r') as tgt_file: + source_data = json.load(src_file) + target_data = json.load(tgt_file) + + # Parse JSON into ChatStore objects using from_json() + source_chats = {jid: ChatStore.from_json(chat) for jid, chat in source_data.items()} + target_chats = {jid: ChatStore.from_json(chat) for jid, chat in target_data.items()} + + # Merge chats using merge_with() + for jid, chat in source_chats.items(): + if jid in target_chats: + target_chats[jid].merge_with(chat) + else: + target_chats[jid] = chat + + # Write merged data back to the target file + with open(target_path, 'w') as merged_file: + merged_data = {jid: chat.to_json() for jid, chat in target_chats.items()} + json.dump(merged_data, merged_file, indent=2) + + # Merge media directories + source_media_path = os.path.join(source_dir, media_dir) + target_media_path = os.path.join(target_dir, media_dir) + if os.path.exists(source_media_path): + for root, dirs, files in os.walk(source_media_path): + relative_path = os.path.relpath(root, source_media_path) + target_root = os.path.join(target_media_path, relative_path) + os.makedirs(target_root, exist_ok=True) + for file in files: + source_file = os.path.join(root, file) + target_file = os.path.join(target_root, file) + # we only copy if the file doesn't exist in the target or if the source is newer + if not os.path.exists(target_file) or os.path.getmtime(source_file) > os.path.getmtime(target_file): + print(f"Copying {source_file} to {target_file}...") + shutil.copy2(source_file, target_file) + + def 
sanitize_filename(file_name: str) -> str: """Sanitizes a filename by removing invalid and unsafe characters. From 7cf73291249245eccd6e8e4b53cc8998233674bf Mon Sep 17 00:00:00 2001 From: fschuh Date: Sun, 13 Apr 2025 02:02:16 -0400 Subject: [PATCH 003/108] Updated help description --- Whatsapp_Chat_Exporter/__main__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index cde7e1a..c8c2204 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -214,10 +214,11 @@ def setup_argument_parser() -> ArgumentParser: dest="incremental_merge", default=False, action='store_true', - help=("Performs an incremental merge of two exports." - "Requires setting both --source-dir and --target-dir." - "The chats and media of the source directory will be merged into the target directory." - "No chats or media will be deleted from the target directory, only new chats and media will be added to it." + help=("Performs an incremental merge of two exports. " + "Requires setting both --source-dir and --target-dir. " + "The chats (JSON files only) and media from the source directory will be merged into the target directory. " + "No chat messages or media will be deleted from the target directory; only new chat messages and media will be added to it. " + "This enables chat messages and media to be deleted from the device to free up space, while ensuring they are preserved in the exported backups." 
) ) inc_merging_group.add_argument( From 0dda7b7bd981d62ab3b0822c11c4483a451e5505 Mon Sep 17 00:00:00 2001 From: fschuh Date: Mon, 14 Apr 2025 00:17:18 -0400 Subject: [PATCH 004/108] Updated README.md with incremental merge help description --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 5730519..cc3084c 100644 --- a/README.md +++ b/README.md @@ -231,6 +231,19 @@ Contact Enrichment: Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. Most likely use the number of your own country +Incremental Merging: + --incremental-merge Performs an incremental merge of two exports. Requires setting both --source- + dir and --target-dir. The chats (JSON files only) and media from the source + directory will be merged into the target directory. No chat messages or media + will be deleted from the target directory; only new chat messages and media + will be added to it. This enables chat messages and media to be deleted from + the device to free up space, while ensuring they are preserved in the exported + backups. + --source-dir SOURCE_DIR + Sets the source directory. Used for performing incremental merges. + --target-dir TARGET_DIR + Sets the target directory. Used for performing incremental merges. 
+ Miscellaneous: -s, --showkey Show the HEX key used to decrypt the database --check-update Check for updates (require Internet access) From f460f764415ed113265198c8ba8ea87c81d5b963 Mon Sep 17 00:00:00 2001 From: fschuh Date: Fri, 25 Apr 2025 19:10:22 -0400 Subject: [PATCH 005/108] Fixed issue on command line args validation --- Whatsapp_Chat_Exporter/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index c8c2204..46b0734 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -277,7 +277,7 @@ def validate_args(parser: ArgumentParser, args) -> None: parser.error("You can only use --import with -j and without --no-html, -a, -i, -e.") elif args.import_json and not os.path.isfile(args.json): parser.error("JSON file not found.") - if args.incremental_merge and args.source_dir is None or args.target_dir is None: + if args.incremental_merge and (args.source_dir is None or args.target_dir is None): parser.error("You must specify both --source-dir and --target-dir for incremental merge.") if args.android and args.business: parser.error("WhatsApp Business is only available on iOS for now.") From 2f1536052677d8e460ca0184ff15608ccf0c82ed Mon Sep 17 00:00:00 2001 From: fschuh Date: Sun, 4 May 2025 15:58:02 -0400 Subject: [PATCH 006/108] Fixed remaining compatibility issues with latest code --- Whatsapp_Chat_Exporter/__main__.py | 27 ++++++++++++++------------- Whatsapp_Chat_Exporter/data_model.py | 12 +++++++----- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 46b0734..5d09643 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -701,8 +701,6 @@ def main(): args.whatsapp_theme, args.headline ) - elif args.incremental_merge: - incremental_merge(args.source_dir, args.target_dir, args.media) elif args.exported: # 
Process exported chat process_exported_chat(args, data) @@ -749,16 +747,19 @@ def main(): if args.wa is None: args.wa = "ContactsV2.sqlite" - # Process contacts - process_contacts(args, data, contact_store) - - # Process messages, media, and calls - process_messages(args, data) - - # Create output files - create_output_files(args, data, contact_store) - - # Handle media directory - handle_media_directory(args) + if args.incremental_merge: + incremental_merge(args.source_dir, args.target_dir, args.media) + else: + # Process contacts + process_contacts(args, data, contact_store) + + # Process messages, media, and calls + process_messages(args, data) + + # Create output files + create_output_files(args, data, contact_store) + + # Handle media directory + handle_media_directory(args) print("Everything is done!") \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 09101b0..80e1ea7 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -255,7 +255,7 @@ class ChatStore: self.status = other.status or self.status # Merge messages - self.messages.update(other.messages) + self._messages.update(other._messages) class Message: """ @@ -341,10 +341,12 @@ class Message: @classmethod def from_json(cls, data): message = cls( - data["from_me"], - data["timestamp"], - data["time"], - data["key_id"] + from_me = data["from_me"], + timestamp = data["timestamp"], + time = data["time"], + key_id = data["key_id"], + received_timestamp = data.get("received_timestamp"), + read_timestamp = data.get("read_timestamp") ) message.media = data.get("media") message.meta = data.get("meta") From fbffc16452185bdb8270071581fa34a415709af1 Mon Sep 17 00:00:00 2001 From: fschuh Date: Sun, 4 May 2025 15:58:53 -0400 Subject: [PATCH 007/108] Added call to main() if directly executing __main__.py file --- Whatsapp_Chat_Exporter/__main__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) 
diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 5d09643..e693727 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -762,4 +762,8 @@ def main(): # Handle media directory handle_media_directory(args) - print("Everything is done!") \ No newline at end of file + print("Everything is done!") + + +if __name__ == "__main__": + main() \ No newline at end of file From 8d34300ea5fb3d6128fbc0bc4c4dce29e498ca19 Mon Sep 17 00:00:00 2001 From: fschuh Date: Sun, 4 May 2025 22:55:42 -0400 Subject: [PATCH 008/108] Merged JSON files are now only updated on disk if the contents have actually changed. --- Whatsapp_Chat_Exporter/utility.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 3f3f71d..02f000c 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -289,10 +289,16 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): else: target_chats[jid] = chat - # Write merged data back to the target file - with open(target_path, 'w') as merged_file: - merged_data = {jid: chat.to_json() for jid, chat in target_chats.items()} - json.dump(merged_data, merged_file, indent=2) + # Serialize merged data + merged_data = {jid: chat.to_json() for jid, chat in target_chats.items()} + + # Check if the merged data differs from the original target data + if json.dumps(merged_data, sort_keys=True) != json.dumps(target_data, sort_keys=True): + print(f"Changes detected in {json_file}, updating target file...") + with open(target_path, 'w') as merged_file: + json.dump(merged_data, merged_file, indent=2) + else: + print(f"No changes detected in {json_file}, skipping update.") # Merge media directories source_media_path = os.path.join(source_dir, media_dir) From fab9bc76490825fe59d48b7fddf7d9b30230741f Mon Sep 17 00:00:00 2001 From: fschuh Date: Mon, 5 May 2025 
00:37:01 -0400 Subject: [PATCH 009/108] Added unit tests --- pyproject.toml | 5 + tests/test_incremental_merge.py | 327 ++++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 tests/test_incremental_merge.py diff --git a/pyproject.toml b/pyproject.toml index 0027184..d31333e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,3 +60,8 @@ include = ["Whatsapp_Chat_Exporter"] [tool.setuptools.package-data] Whatsapp_Chat_Exporter = ["*.html"] + +[dependency-groups] +dev = [ + "pytest>=8.3.5", +] diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py new file mode 100644 index 0000000..730350e --- /dev/null +++ b/tests/test_incremental_merge.py @@ -0,0 +1,327 @@ +import os +import json +import pytest +from unittest.mock import patch, mock_open, call, MagicMock +from Whatsapp_Chat_Exporter.utility import incremental_merge +from Whatsapp_Chat_Exporter.data_model import ChatStore + +# Test data setup +chat_data_1 = { + "12345678@s.whatsapp.net": { + "name": "Friend", + "type": "ios", + "my_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared\\Media/Profile/Photo.jpg", + "their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", + "their_avatar_thumb": None, + "status": None, + "messages": { + "24690": { + "from_me": True, + "timestamp": 1463926635.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B7E", + "meta": False, + "data": "I'm here", + "safe": False, + "sticker": False + }, + "24691": { # This message only exists in target + "from_me": False, + "timestamp": 1463926641.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B8E", + "meta": False, + "data": "Great to see you", + "safe": False, + "sticker": False + } + } + } +} + +chat_data_2 = { + "12345678@s.whatsapp.net": { + "name": "Friend", + "type": "ios", + "my_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared\\Media/Profile/Photo.jpg", + 
"their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", + "their_avatar_thumb": None, + "status": None, + "messages": { + "24690": { + "from_me": True, + "timestamp": 1463926635.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B7E", + "meta": False, + "data": "I'm here", + "safe": False, + "sticker": False + }, + "24692": { # This message only exists in source + "from_me": False, + "timestamp": 1463926642.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B9E", + "meta": False, + "data": "Hi there!", + "safe": False, + "sticker": False + }, + } + } +} + +# Expected merged data - should contain all messages with all fields initialized as they would be by Message class +chat_data_merged = { + "12345678@s.whatsapp.net": { + "name": "Friend", + "type": "ios", + "my_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared\\Media/Profile/Photo.jpg", + "their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", + "their_avatar_thumb": None, + "status": None, + "messages": { + "24690": { + "from_me": True, + "timestamp": 1463926635.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B7E", + "meta": False, + "data": "I'm here", + "sender": None, + "safe": False, + "mime": None, + "reply": None, + "quoted_data": None, + "caption": None, + "thumb": None, + "sticker": False + }, + "24691": { + "from_me": False, + "timestamp": 1463926641.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B8E", + "meta": False, + "data": "Great to see you", + "sender": None, + "safe": False, + "mime": None, + "reply": None, + "quoted_data": None, + "caption": None, + "thumb": None, + "sticker": False + }, + "24692": { + "from_me": False, + "timestamp": 1463926642.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B9E", + "meta": False, + "data": "Hi there!", + "sender": None, + "safe": 
False, + "mime": None, + "reply": None, + "quoted_data": None, + "caption": None, + "thumb": None, + "sticker": False + }, + } + } +} + + +@pytest.fixture +def mock_filesystem(): + with ( + patch("os.path.exists") as mock_exists, + patch("os.makedirs") as mock_makedirs, + patch("os.path.getmtime") as mock_getmtime, + patch("os.listdir") as mock_listdir, + patch("os.walk") as mock_walk, + patch("shutil.copy2") as mock_copy2, + ): + yield { + "exists": mock_exists, + "makedirs": mock_makedirs, + "getmtime": mock_getmtime, + "listdir": mock_listdir, + "walk": mock_walk, + "copy2": mock_copy2, + } + + +def test_incremental_merge_new_file(mock_filesystem): + """Test merging when target file doesn't exist""" + source_dir = "/source" + target_dir = "/target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: x == "/source" + mock_filesystem["listdir"].return_value = ["chat.json"] + + # Mock file operations + mock_file_content = { + "/source/chat.json": json.dumps(chat_data_1), + } + + with patch("builtins.open", mock_open()) as mock_file: + + def mock_file_read(filename, mode="r"): + content = mock_file_content.get(filename) + file_mock = mock_open(read_data=content).return_value + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir) + + # Verify the operations + mock_filesystem["makedirs"].assert_called_once_with(target_dir, exist_ok=True) + mock_file.assert_any_call("/source/chat.json", "rb") + mock_file.assert_any_call("/target/chat.json", "wb") + + +def test_incremental_merge_existing_file_with_changes(mock_filesystem): + """Test merging when target file exists and has changes""" + source_dir = "/source" + target_dir = "/target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: True + mock_filesystem["listdir"].return_value = ["chat.json"] + + # Mock file operations + 
mock_file_content = { + "/source/chat.json": json.dumps(chat_data_2), + "/target/chat.json": json.dumps(chat_data_1), + } + + written_chunks = [] + + def mock_file_write(data): + written_chunks.append(data) + + mock_write = MagicMock(side_effect=mock_file_write) + + with patch("builtins.open", mock_open()) as mock_file: + def mock_file_read(filename, mode="r"): + content = mock_file_content.get(filename) + file_mock = mock_open(read_data=content).return_value + if mode == 'w': + file_mock.write.side_effect = mock_write + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir) + + # Verify file operations - both files opened in text mode when target exists + mock_file.assert_any_call("/source/chat.json", "r") + mock_file.assert_any_call("/target/chat.json", "r") + mock_file.assert_any_call("/target/chat.json", "w") + + # Verify write was called + assert mock_write.called, "Write method was never called" + + # Combine chunks and parse JSON + written_data = json.loads(''.join(written_chunks)) + + # Verify the merged data is correct + assert written_data is not None, "No data was written" + assert written_data == chat_data_merged, "Merged data does not match expected result" + + # Verify specific message retention + messages = written_data["12345678@s.whatsapp.net"]["messages"] + assert "24690" in messages, "Common message should be present" + assert "24691" in messages, "Target-only message should be preserved" + assert "24692" in messages, "Source-only message should be added" + assert len(messages) == 3, "Should have exactly 3 messages" + + +def test_incremental_merge_existing_file_no_changes(mock_filesystem): + """Test merging when target file exists but has no changes""" + source_dir = "/source" + target_dir = "/target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: True + mock_filesystem["listdir"].return_value = ["chat.json"] + 
+ # Mock file operations + mock_file_content = { + "/source/chat.json": json.dumps(chat_data_1), + "/target/chat.json": json.dumps(chat_data_1), + } + + with patch("builtins.open", mock_open()) as mock_file: + + def mock_file_read(filename, mode="r"): + content = mock_file_content.get(filename) + file_mock = mock_open(read_data=content).return_value + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir) + + # Verify no write operations occurred on target file + write_calls = [call for call in mock_file.mock_calls if call[0] == "().write"] + assert len(write_calls) == 0 + + +def test_incremental_merge_media_copy(mock_filesystem): + """Test media file copying during merge""" + source_dir = "/source" + target_dir = "/target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: True + mock_filesystem["listdir"].return_value = ["chat.json"] + mock_filesystem["walk"].return_value = [ + ("/source/media", ["subfolder"], ["file1.jpg"]), + ("/source/media/subfolder", [], ["file2.jpg"]), + ] + mock_filesystem["getmtime"].side_effect = lambda x: 1000 if "source" in x else 500 + + # Mock file operations + mock_file_content = { + "/source/chat.json": json.dumps(chat_data_1), + "/target/chat.json": json.dumps(chat_data_1), + } + + with patch("builtins.open", mock_open()) as mock_file: + + def mock_file_read(filename, mode="r"): + content = mock_file_content.get(filename) + file_mock = mock_open(read_data=content).return_value + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir) + + # Verify media file operations + assert ( + mock_filesystem["makedirs"].call_count >= 2 + ) # At least target dir and media dir + assert mock_filesystem["copy2"].call_count == 2 # Two media files copied From 1f9cbc3ad23aa8e80087ea2a500af3014378f757 Mon Sep 17 00:00:00 2001 From: fschuh 
Date: Mon, 5 May 2025 00:39:13 -0400 Subject: [PATCH 010/108] Updated .gitignore with some additional dev folders --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5831f34..c427c08 100644 --- a/.gitignore +++ b/.gitignore @@ -138,7 +138,9 @@ __main__ # Dev time intermidiates & temp files result/ +output/ WhatsApp/ +AppDomainGroup-group.net.whatsapp.WhatsApp.shared/ /*.db /*.db-* /myout From d0fc620ba6997474925660b3a5b60a17b320f479 Mon Sep 17 00:00:00 2001 From: fschuh Date: Mon, 5 May 2025 00:41:10 -0400 Subject: [PATCH 011/108] Added print statement with merging media folder names --- Whatsapp_Chat_Exporter/utility.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 02f000c..69f2aa2 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -303,6 +303,7 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): # Merge media directories source_media_path = os.path.join(source_dir, media_dir) target_media_path = os.path.join(target_dir, media_dir) + print(f"Merging media directories. 
Source dir: {source_media_path}, target dir: {target_media_path}") if os.path.exists(source_media_path): for root, dirs, files in os.walk(source_media_path): relative_path = os.path.relpath(root, source_media_path) From 534aea924d917d5d6fad153b74af83cc668e226b Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 15:20:14 +0800 Subject: [PATCH 012/108] Add docs --- Whatsapp_Chat_Exporter/data_model.py | 13 ++++++++++--- Whatsapp_Chat_Exporter/utility.py | 11 +++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 80e1ea7..9a7a713 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -215,7 +215,8 @@ class ChatStore: } @classmethod - def from_json(cls, data): + def from_json(cls, data: Dict) -> 'ChatStore': + """Create a chat store from JSON data.""" chat = cls(data.get("type"), data.get("name")) chat.my_avatar = data.get("my_avatar") chat.their_avatar = data.get("their_avatar") @@ -242,7 +243,13 @@ class ChatStore: """Get all message keys in the chat.""" return self._messages.keys() - def merge_with(self, other): + def merge_with(self, other: 'ChatStore'): + """Merge another ChatStore into this one. 
+ + Args: + other (ChatStore): The ChatStore to merge with + + """ if not isinstance(other, ChatStore): raise TypeError("Can only merge with another ChatStore object") @@ -339,7 +346,7 @@ class Message: } @classmethod - def from_json(cls, data): + def from_json(cls, data: Dict) -> 'Message': message = cls( from_me = data["from_me"], timestamp = data["timestamp"], diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 69f2aa2..b0ee613 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -5,18 +5,18 @@ import os import unicodedata import re import math +import shutil from bleach import clean as sanitize from markupsafe import Markup from datetime import datetime, timedelta from enum import IntEnum from Whatsapp_Chat_Exporter.data_model import ChatStore -import shutil from typing import Dict, List, Optional, Tuple try: from enum import StrEnum, IntEnum except ImportError: # < Python 3.11 - # This should be removed when the support for Python 3.10 ends. + # This should be removed when the support for Python 3.10 ends. (31 Oct 2026) from enum import Enum class StrEnum(str, Enum): pass @@ -260,6 +260,13 @@ def import_from_json(json_file: str, data: Dict[str, ChatStore]): def incremental_merge(source_dir: str, target_dir: str, media_dir: str): + """Merges JSON files from the source directory into the target directory. + + Args: + source_dir (str): The path to the source directory containing JSON files. + target_dir (str): The path to the target directory to merge into. + media_dir (str): The path to the media directory. 
+ """ json_files = [f for f in os.listdir(source_dir) if f.endswith('.json')] print("JSON files found:", json_files) From ad267a7226a7dc2c0765e7b3ba366e3ef3db212a Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 15:20:46 +0800 Subject: [PATCH 013/108] Quote all paths in output messages --- Whatsapp_Chat_Exporter/utility.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index b0ee613..e78c339 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -275,12 +275,12 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): target_path = os.path.join(target_dir, json_file) if not os.path.exists(target_path): - print(f"Copying {json_file} to target directory...") + print(f"Copying '{json_file}' to target directory...") os.makedirs(target_dir, exist_ok=True) with open(source_path, 'rb') as src, open(target_path, 'wb') as dst: dst.write(src.read()) else: - print(f"Merging {json_file} with existing file in target directory...") + print(f"Merging '{json_file}' with existing file in target directory...") with open(source_path, 'r') as src_file, open(target_path, 'r') as tgt_file: source_data = json.load(src_file) target_data = json.load(tgt_file) @@ -301,16 +301,16 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): # Check if the merged data differs from the original target data if json.dumps(merged_data, sort_keys=True) != json.dumps(target_data, sort_keys=True): - print(f"Changes detected in {json_file}, updating target file...") + print(f"Changes detected in '{json_file}', updating target file...") with open(target_path, 'w') as merged_file: json.dump(merged_data, merged_file, indent=2) else: - print(f"No changes detected in {json_file}, skipping update.") + print(f"No changes detected in '{json_file}', skipping update.") # Merge media 
directories source_media_path = os.path.join(source_dir, media_dir) target_media_path = os.path.join(target_dir, media_dir) - print(f"Merging media directories. Source dir: {source_media_path}, target dir: {target_media_path}") + print(f"Merging media directories. Source: {source_media_path}, target: {target_media_path}") if os.path.exists(source_media_path): for root, dirs, files in os.walk(source_media_path): relative_path = os.path.relpath(root, source_media_path) @@ -321,7 +321,7 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): target_file = os.path.join(target_root, file) # we only copy if the file doesn't exist in the target or if the source is newer if not os.path.exists(target_file) or os.path.getmtime(source_file) > os.path.getmtime(target_file): - print(f"Copying {source_file} to {target_file}...") + print(f"Copying '{source_file}' to '{target_file}'...") shutil.copy2(source_file, target_file) From b8652fcb96d975b66a2f3ec89e8fc4255dfd7271 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 15:22:00 +0800 Subject: [PATCH 014/108] Throwaway variable --- Whatsapp_Chat_Exporter/utility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index e78c339..3e5e70f 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -312,7 +312,7 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): target_media_path = os.path.join(target_dir, media_dir) print(f"Merging media directories. 
Source: {source_media_path}, target: {target_media_path}") if os.path.exists(source_media_path): - for root, dirs, files in os.walk(source_media_path): + for root, _, files in os.walk(source_media_path): relative_path = os.path.relpath(root, source_media_path) target_root = os.path.join(target_media_path, relative_path) os.makedirs(target_root, exist_ok=True) From 5a9944d14b754e4c1c95c4d539e6c83a7d030105 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:09:53 +0800 Subject: [PATCH 015/108] Respects users' choices on the output JSON --- Whatsapp_Chat_Exporter/__main__.py | 13 ++++++++++--- Whatsapp_Chat_Exporter/utility.py | 9 +++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index e693727..a62d317 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -269,7 +269,7 @@ def validate_args(parser: ArgumentParser, args) -> None: # Basic validation checks if args.android and args.ios and args.exported and args.import_json: parser.error("You must define only one device type.") - if not args.android and not args.ios and not args.exported and not args.import_json: + if not args.android and not args.ios and not args.exported and not args.import_json and not args.incremental_merge: parser.error("You must define the device type.") if args.no_html and not args.json and not args.text_format: parser.error("You must either specify a JSON output file, text file output directory or enable HTML output.") @@ -748,7 +748,14 @@ def main(): args.wa = "ContactsV2.sqlite" if args.incremental_merge: - incremental_merge(args.source_dir, args.target_dir, args.media) + incremental_merge( + args.source_dir, + args.target_dir, + args.media, + args.pretty_print_json, + args.avoid_encoding_json + ) + print("Incremental merge completed successfully.") else: # Process contacts process_contacts(args, data, 
contact_store) @@ -766,4 +773,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 3e5e70f..b3303bd 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -259,7 +259,7 @@ def import_from_json(json_file: str, data: Dict[str, ChatStore]): print(f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") -def incremental_merge(source_dir: str, target_dir: str, media_dir: str): +def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_print_json: int, avoid_encoding_json: bool): """Merges JSON files from the source directory into the target directory. Args: @@ -303,7 +303,12 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str): if json.dumps(merged_data, sort_keys=True) != json.dumps(target_data, sort_keys=True): print(f"Changes detected in '{json_file}', updating target file...") with open(target_path, 'w') as merged_file: - json.dump(merged_data, merged_file, indent=2) + json.dump( + merged_data, + merged_file, + indent=pretty_print_json, + ensure_ascii=not avoid_encoding_json, + ) else: print(f"No changes detected in '{json_file}', skipping update.") From eded9a140fcd14825cfb3af060426763f603876e Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:11:10 +0800 Subject: [PATCH 016/108] Add new attributes to JSON --- Whatsapp_Chat_Exporter/data_model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 9a7a713..26e12cb 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -211,6 +211,7 @@ class ChatStore: 'their_avatar': self.their_avatar, 'their_avatar_thumb': self.their_avatar_thumb, 'status': self.status, + 'media_base': self.media_base, 'messages': 
{id: msg.to_json() for id, msg in self._messages.items()} } @@ -342,7 +343,10 @@ class Message: 'quoted_data': self.quoted_data, 'caption': self.caption, 'thumb': self.thumb, - 'sticker': self.sticker + 'sticker': self.sticker, + 'message_type': self.message_type, + 'received_timestamp': self.received_timestamp, + 'read_timestamp': self.read_timestamp } @classmethod From 9d76cf60afe3f53adceb439421d530e8f76c8c7e Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:12:13 +0800 Subject: [PATCH 017/108] Attach media_base from JSON --- Whatsapp_Chat_Exporter/data_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 26e12cb..98d8f65 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -223,6 +223,7 @@ class ChatStore: chat.their_avatar = data.get("their_avatar") chat.their_avatar_thumb = data.get("their_avatar_thumb") chat.status = data.get("status") + chat.media_base = data.get("media_base") for id, msg_data in data.get("messages", {}).items(): message = Message.from_json(msg_data) chat.add_message(id, message) From 3e3aeae7ada2a7ec504836e5d0388c5ca6967f84 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:12:57 +0800 Subject: [PATCH 018/108] key_id can also be a string --- Whatsapp_Chat_Exporter/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 98d8f65..61d1440 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -276,7 +276,7 @@ class Message: from_me: Union[bool, int], timestamp: int, time: Union[int, float, str], - key_id: int, + key_id: Union[int, str], received_timestamp: int, read_timestamp: int, timezone_offset: int = 0, From 352be849a764c243afa18139be0d6e7f917ac35c Mon 
Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:13:17 +0800 Subject: [PATCH 019/108] Bug fix on messages with timestamp being '0' --- Whatsapp_Chat_Exporter/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 61d1440..8258d18 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -27,7 +27,7 @@ class Timing: Returns: Optional[str]: Formatted timestamp string, or None if timestamp is None """ - if timestamp: + if timestamp is not None: timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp return datetime.fromtimestamp(timestamp, TimeZone(self.timezone_offset)).strftime(format) return None From 75c2db6d5c83ee0aefd11ca9c252d85246c83af6 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:13:48 +0800 Subject: [PATCH 020/108] Accept both raw timestamp and formatted time string --- Whatsapp_Chat_Exporter/data_model.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 8258d18..e456a54 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -317,9 +317,19 @@ class Message: self.safe = False self.mime = None self.message_type = message_type, - self.received_timestamp = timing.format_timestamp(received_timestamp, "%Y/%m/%d %H:%M") - self.read_timestamp = timing.format_timestamp(read_timestamp, "%Y/%m/%d %H:%M") - + if isinstance(received_timestamp, (int, float)): + self.received_timestamp = timing.format_timestamp(received_timestamp, "%Y/%m/%d %H:%M") + elif isinstance(received_timestamp, str): + self.received_timestamp = received_timestamp + else: + self.received_timestamp = None + if isinstance(read_timestamp, (int, float)): + self.read_timestamp 
= timing.format_timestamp(read_timestamp, "%Y/%m/%d %H:%M") + elif isinstance(read_timestamp, str): + self.read_timestamp = read_timestamp + else: + self.read_timestamp = None + # Extra attributes self.reply = None self.quoted_data = None From ed2ec7cb9e7cbfe4e2fac914dc48d1f205cfa558 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:14:05 +0800 Subject: [PATCH 021/108] Exit if no json is found --- Whatsapp_Chat_Exporter/utility.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index b3303bd..cbc67ae 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -268,6 +268,10 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p media_dir (str): The path to the media directory. """ json_files = [f for f in os.listdir(source_dir) if f.endswith('.json')] + if not json_files: + print("No JSON files found in the source directory.") + return + print("JSON files found:", json_files) for json_file in json_files: From 60790d89e35e5f189cb67a7c5ab03e14c8ba352e Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 16:15:51 +0800 Subject: [PATCH 022/108] Remove args.incremental_merge from device type check --- Whatsapp_Chat_Exporter/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index a62d317..7ca07ca 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -269,7 +269,7 @@ def validate_args(parser: ArgumentParser, args) -> None: # Basic validation checks if args.android and args.ios and args.exported and args.import_json: parser.error("You must define only one device type.") - if not args.android and not args.ios and not args.exported and not args.import_json and not args.incremental_merge: + if not 
args.android and not args.ios and not args.exported and not args.import_json: parser.error("You must define the device type.") if args.no_html and not args.json and not args.text_format: parser.error("You must either specify a JSON output file, text file output directory or enable HTML output.") From 4ca56b1c5c5d1c83c5277e86f473533fdb5a06d5 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 17:08:35 +0800 Subject: [PATCH 023/108] Bug fix on wrong type of self.message_type --- Whatsapp_Chat_Exporter/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index e456a54..90787f4 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -316,7 +316,7 @@ class Message: self.sender = None self.safe = False self.mime = None - self.message_type = message_type, + self.message_type = message_type if isinstance(received_timestamp, (int, float)): self.received_timestamp = timing.format_timestamp(received_timestamp, "%Y/%m/%d %H:%M") elif isinstance(received_timestamp, str): From 4e1d994aa5ac21f5f8beddd1a254e0bff31250d2 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 17:10:57 +0800 Subject: [PATCH 024/108] Add message_type when importing json --- Whatsapp_Chat_Exporter/data_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 90787f4..3dc6b24 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -367,6 +367,7 @@ class Message: timestamp = data["timestamp"], time = data["time"], key_id = data["key_id"], + message_type = data.get("message_type"), received_timestamp = data.get("received_timestamp"), read_timestamp = data.get("read_timestamp") ) From 3220ed2d3ff5c80a51d75b2fdeffe8dab45f6519 Mon Sep 17 00:00:00 
2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 17:11:55 +0800 Subject: [PATCH 025/108] Update testing data --- tests/test_incremental_merge.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py index 730350e..abd1dc2 100644 --- a/tests/test_incremental_merge.py +++ b/tests/test_incremental_merge.py @@ -85,6 +85,7 @@ chat_data_merged = { "their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", "their_avatar_thumb": None, "status": None, + "media_base": None, "messages": { "24690": { "from_me": True, @@ -101,7 +102,10 @@ chat_data_merged = { "quoted_data": None, "caption": None, "thumb": None, - "sticker": False + "sticker": False, + "message_type": None, + "received_timestamp": None, + "read_timestamp": None }, "24691": { "from_me": False, @@ -118,7 +122,10 @@ chat_data_merged = { "quoted_data": None, "caption": None, "thumb": None, - "sticker": False + "sticker": False, + "message_type": None, + "received_timestamp": None, + "read_timestamp": None }, "24692": { "from_me": False, @@ -135,7 +142,10 @@ chat_data_merged = { "quoted_data": None, "caption": None, "thumb": None, - "sticker": False + "sticker": False, + "message_type": None, + "received_timestamp": None, + "read_timestamp": None }, } } @@ -187,10 +197,9 @@ def test_incremental_merge_new_file(mock_filesystem): mock_file.side_effect = mock_file_read # Run the function - incremental_merge(source_dir, target_dir, media_dir) + incremental_merge(source_dir, target_dir, media_dir, 2, True) # Verify the operations - mock_filesystem["makedirs"].assert_called_once_with(target_dir, exist_ok=True) mock_file.assert_any_call("/source/chat.json", "rb") mock_file.assert_any_call("/target/chat.json", "wb") @@ -229,7 +238,7 @@ def test_incremental_merge_existing_file_with_changes(mock_filesystem): mock_file.side_effect = 
mock_file_read # Run the function - incremental_merge(source_dir, target_dir, media_dir) + incremental_merge(source_dir, target_dir, media_dir, 2, True) # Verify file operations - both files opened in text mode when target exists mock_file.assert_any_call("/source/chat.json", "r") @@ -280,7 +289,7 @@ def test_incremental_merge_existing_file_no_changes(mock_filesystem): mock_file.side_effect = mock_file_read # Run the function - incremental_merge(source_dir, target_dir, media_dir) + incremental_merge(source_dir, target_dir, media_dir, 2, True) # Verify no write operations occurred on target file write_calls = [call for call in mock_file.mock_calls if call[0] == "().write"] @@ -318,7 +327,7 @@ def test_incremental_merge_media_copy(mock_filesystem): mock_file.side_effect = mock_file_read # Run the function - incremental_merge(source_dir, target_dir, media_dir) + incremental_merge(source_dir, target_dir, media_dir, 2, True) # Verify media file operations assert ( From a58dd78be8a101ad90657985ef9ddfc8c773c3f8 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Mon, 5 May 2025 17:13:43 +0800 Subject: [PATCH 026/108] PEP8 --- Whatsapp_Chat_Exporter/__main__.py | 203 +++++++++++++++------------ Whatsapp_Chat_Exporter/data_model.py | 43 +++--- Whatsapp_Chat_Exporter/utility.py | 117 +++++++++------ tests/test_incremental_merge.py | 32 +++-- 4 files changed, 224 insertions(+), 171 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 7ca07ca..51bbabf 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -34,12 +34,12 @@ def setup_argument_parser() -> ArgumentParser: """Set up and return the argument parser with all options.""" parser = ArgumentParser( description='A customizable Android and iOS/iPadOS WhatsApp database parser that ' - 'will give you the history of your WhatsApp conversations in HTML ' - 'and JSON. 
Android Backup Crypt12, Crypt14 and Crypt15 supported.', + 'will give you the history of your WhatsApp conversations in HTML ' + 'and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported.', epilog=f'WhatsApp Chat Exporter: {importlib.metadata.version("whatsapp_chat_exporter")} Licensed with MIT. See ' - 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' + 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' ) - + # Device type arguments device_group = parser.add_argument_group('Device Type') device_group.add_argument( @@ -54,7 +54,7 @@ def setup_argument_parser() -> ArgumentParser: "-e", "--exported", dest="exported", default=None, help="Define the target as exported chat file and specify the path to the file" ) - + # Input file paths input_group = parser.add_argument_group('Input Files') input_group.add_argument( @@ -86,7 +86,7 @@ def setup_argument_parser() -> ArgumentParser: "--wab", "--wa-backup", dest="wab", default=None, help="Path to contact database in crypt15 format" ) - + # Output options output_group = parser.add_argument_group('Output Options') output_group.add_argument( @@ -109,7 +109,7 @@ def setup_argument_parser() -> ArgumentParser: "--size", "--output-size", "--split", dest="size", nargs='?', const=0, default=None, help="Maximum (rough) size of a single output file in bytes, 0 for auto" ) - + # JSON formatting options json_group = parser.add_argument_group('JSON Options') json_group.add_argument( @@ -128,7 +128,7 @@ def setup_argument_parser() -> ArgumentParser: "--import", dest="import_json", default=False, action='store_true', help="Import JSON file and convert to HTML output" ) - + # HTML options html_group = parser.add_argument_group('HTML Options') html_group.add_argument( @@ -155,7 +155,7 @@ def setup_argument_parser() -> ArgumentParser: "--headline", dest="headline", default="Chat history with ??", help="The custom headline for the HTML output. Use '??' 
as a placeholder for the chat name" ) - + # Media handling media_group = parser.add_argument_group('Media Handling') media_group.add_argument( @@ -166,7 +166,7 @@ def setup_argument_parser() -> ArgumentParser: "--create-separated-media", dest="separate_media", default=False, action='store_true', help="Create a copy of the media seperated per chat in /separated/ directory" ) - + # Filtering options filter_group = parser.add_argument_group('Filtering Options') filter_group.add_argument( @@ -195,7 +195,7 @@ def setup_argument_parser() -> ArgumentParser: "Setting this flag will cause the exporter to render those. " "This is useful if chat(s) are missing from the output") ) - + # Contact enrichment contact_group = parser.add_argument_group('Contact Enrichment') contact_group.add_argument( @@ -219,7 +219,7 @@ def setup_argument_parser() -> ArgumentParser: "The chats (JSON files only) and media from the source directory will be merged into the target directory. " "No chat messages or media will be deleted from the target directory; only new chat messages and media will be added to it. " "This enables chat messages and media to be deleted from the device to free up space, while ensuring they are preserved in the exported backups." - ) + ) ) inc_merging_group.add_argument( "--source-dir", @@ -233,7 +233,7 @@ def setup_argument_parser() -> ArgumentParser: default=None, help="Sets the target directory. Used for performing incremental merges." ) - + # Miscellaneous misc_group = parser.add_argument_group('Miscellaneous') misc_group.add_argument( @@ -260,7 +260,7 @@ def setup_argument_parser() -> ArgumentParser: "--max-bruteforce-worker", dest="max_bruteforce_worker", default=10, type=int, help="Specify the maximum number of worker for bruteforce decryption." 
) - + return parser @@ -272,52 +272,59 @@ def validate_args(parser: ArgumentParser, args) -> None: if not args.android and not args.ios and not args.exported and not args.import_json: parser.error("You must define the device type.") if args.no_html and not args.json and not args.text_format: - parser.error("You must either specify a JSON output file, text file output directory or enable HTML output.") + parser.error( + "You must either specify a JSON output file, text file output directory or enable HTML output.") if args.import_json and (args.android or args.ios or args.exported or args.no_html): - parser.error("You can only use --import with -j and without --no-html, -a, -i, -e.") + parser.error( + "You can only use --import with -j and without --no-html, -a, -i, -e.") elif args.import_json and not os.path.isfile(args.json): parser.error("JSON file not found.") if args.incremental_merge and (args.source_dir is None or args.target_dir is None): - parser.error("You must specify both --source-dir and --target-dir for incremental merge.") + parser.error( + "You must specify both --source-dir and --target-dir for incremental merge.") if args.android and args.business: parser.error("WhatsApp Business is only available on iOS for now.") if "??" not in args.headline: parser.error("--headline must contain '??' 
for replacement.") - + # JSON validation if args.json_per_chat and args.json and ( - (args.json.endswith(".json") and os.path.isfile(args.json)) or + (args.json.endswith(".json") and os.path.isfile(args.json)) or (not args.json.endswith(".json") and os.path.isfile(args.json)) ): - parser.error("When --per-chat is enabled, the destination of --json must be a directory.") - + parser.error( + "When --per-chat is enabled, the destination of --json must be a directory.") + # vCards validation if args.enrich_from_vcards is not None and args.default_country_code is None: - parser.error("When --enrich-from-vcards is provided, you must also set --default-country-code") - + parser.error( + "When --enrich-from-vcards is provided, you must also set --default-country-code") + # Size validation if args.size is not None and not isinstance(args.size, int) and not args.size.isnumeric(): try: args.size = readable_to_bytes(args.size) except ValueError: - parser.error("The value for --split must be ended in pure bytes or with a proper unit (e.g., 1048576 or 1MB)") - + parser.error( + "The value for --split must be ended in pure bytes or with a proper unit (e.g., 1048576 or 1MB)") + # Date filter validation and processing if args.filter_date is not None: process_date_filter(parser, args) - + # Crypt15 key validation if args.key is None and args.backup is not None and args.backup.endswith("crypt15"): args.key = getpass("Enter your encryption key: ") - + # Theme validation if args.whatsapp_theme: args.template = "whatsapp_new.html" - + # Chat filter validation if args.filter_chat_include is not None and args.filter_chat_exclude is not None: - parser.error("Chat inclusion and exclusion filters cannot be used together.") - + parser.error( + "Chat inclusion and exclusion filters cannot be used together.") + validate_chat_filters(parser, args.filter_chat_include) validate_chat_filters(parser, args.filter_chat_exclude) @@ -327,21 +334,24 @@ def validate_chat_filters(parser: ArgumentParser, 
chat_filter: Optional[List[str if chat_filter is not None: for chat in chat_filter: if not chat.isnumeric(): - parser.error("Enter a phone number in the chat filter. See https://wts.knugi.dev/docs?dest=chat") + parser.error( + "Enter a phone number in the chat filter. See https://wts.knugi.dev/docs?dest=chat") def process_date_filter(parser: ArgumentParser, args) -> None: """Process and validate date filter arguments.""" if " - " in args.filter_date: start, end = args.filter_date.split(" - ") - start = int(datetime.strptime(start, args.filter_date_format).timestamp()) + start = int(datetime.strptime( + start, args.filter_date_format).timestamp()) end = int(datetime.strptime(end, args.filter_date_format).timestamp()) - + if start < 1009843200 or end < 1009843200: parser.error("WhatsApp was first released in 2009...") if start > end: - parser.error("The start date cannot be a moment after the end date.") - + parser.error( + "The start date cannot be a moment after the end date.") + if args.android: args.filter_date = f"BETWEEN {start}000 AND {end}000" elif args.ios: @@ -353,13 +363,15 @@ def process_date_filter(parser: ArgumentParser, args) -> None: def process_single_date_filter(parser: ArgumentParser, args) -> None: """Process single date comparison filters.""" if len(args.filter_date) < 3: - parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") - - _timestamp = int(datetime.strptime(args.filter_date[2:], args.filter_date_format).timestamp()) - + parser.error( + "Unsupported date format. 
See https://wts.knugi.dev/docs?dest=date") + + _timestamp = int(datetime.strptime( + args.filter_date[2:], args.filter_date_format).timestamp()) + if _timestamp < 1009843200: parser.error("WhatsApp was first released in 2009...") - + if args.filter_date[:2] == "> ": if args.android: args.filter_date = f">= {_timestamp}000" @@ -371,7 +383,8 @@ def process_single_date_filter(parser: ArgumentParser, args) -> None: elif args.ios: args.filter_date = f"<= {_timestamp - APPLE_TIME}" else: - parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") + parser.error( + "Unsupported date format. See https://wts.knugi.dev/docs?dest=date") def setup_contact_store(args) -> Optional['ContactsFromVCards']: @@ -385,7 +398,8 @@ def setup_contact_store(args) -> Optional['ContactsFromVCards']: ) exit(1) contact_store = ContactsFromVCards() - contact_store.load_vcf_file(args.enrich_from_vcards, args.default_country_code) + contact_store.load_vcf_file( + args.enrich_from_vcards, args.default_country_code) return contact_store return None @@ -395,9 +409,9 @@ def decrypt_android_backup(args) -> int: if args.key is None or args.backup is None: print("You must specify the backup file with -b and a key with -k") return 1 - + print("Decryption key specified, decrypting WhatsApp backup...") - + # Determine crypt type if "crypt12" in args.backup: crypt = Crypt.CRYPT12 @@ -408,7 +422,7 @@ def decrypt_android_backup(args) -> int: else: print("Unknown backup format. 
The backup file must be crypt12, crypt14 or crypt15.") return 1 - + # Get key keyfile_stream = False if not os.path.isfile(args.key) and all(char in string.hexdigits for char in args.key.replace(" ", "")): @@ -416,10 +430,10 @@ def decrypt_android_backup(args) -> int: else: key = open(args.key, "rb") keyfile_stream = True - + # Read backup db = open(args.backup, "rb").read() - + # Process WAB if provided error_wa = 0 if args.wab: @@ -436,7 +450,7 @@ def decrypt_android_backup(args) -> int: ) if isinstance(key, io.IOBase): key.seek(0) - + # Decrypt message database error_message = android_crypt.decrypt_backup( db, @@ -448,7 +462,7 @@ def decrypt_android_backup(args) -> int: keyfile_stream=keyfile_stream, max_worker=args.max_bruteforce_worker ) - + # Handle errors if error_wa != 0: return error_wa @@ -473,7 +487,7 @@ def handle_decrypt_error(error: int) -> None: def process_contacts(args, data: ChatCollection, contact_store=None) -> None: """Process contacts from the database.""" contact_db = args.wa if args.wa else "wa.db" if args.android else "ContactsV2.sqlite" - + if os.path.isfile(contact_db): with sqlite3.connect(contact_db) as db: db.row_factory = sqlite3.Row @@ -486,42 +500,42 @@ def process_contacts(args, data: ChatCollection, contact_store=None) -> None: def process_messages(args, data: ChatCollection) -> None: """Process messages, media and vcards from the database.""" msg_db = args.db if args.db else "msgstore.db" if args.android else args.identifiers.MESSAGE - + if not os.path.isfile(msg_db): print( "The message database does not exist. You may specify the path " "to database file with option -d or check your provided path." 
) exit(6) - + filter_chat = (args.filter_chat_include, args.filter_chat_exclude) - + with sqlite3.connect(msg_db) as db: db.row_factory = sqlite3.Row - + # Process messages if args.android: message_handler = android_handler else: message_handler = ios_handler - + message_handler.messages( - db, data, args.media, args.timezone_offset, + db, data, args.media, args.timezone_offset, args.filter_date, filter_chat, args.filter_empty ) - + # Process media message_handler.media( - db, data, args.media, args.filter_date, + db, data, args.media, args.filter_date, filter_chat, args.filter_empty, args.separate_media ) - + # Process vcards message_handler.vcard( - db, data, args.media, args.filter_date, + db, data, args.media, args.filter_date, filter_chat, args.filter_empty ) - + # Process calls process_calls(args, db, data, filter_chat) @@ -540,9 +554,10 @@ def handle_media_directory(args) -> None: """Handle media directory copying or moving.""" if os.path.isdir(args.media): media_path = os.path.join(args.output, args.media) - + if os.path.isdir(media_path): - print("\nWhatsApp directory already exists in output directory. Skipping...", end="\n") + print( + "\nWhatsApp directory already exists in output directory. 
Skipping...", end="\n") else: if args.move_media: try: @@ -563,7 +578,7 @@ def create_output_files(args, data: ChatCollection, contact_store=None) -> None: # Enrich from vcards if available if contact_store and not contact_store.is_empty(): contact_store.enrich_from_vcards(data) - + android_handler.create_html( data, args.output, @@ -575,12 +590,12 @@ def create_output_files(args, data: ChatCollection, contact_store=None) -> None: args.whatsapp_theme, args.headline ) - + # Create text files if requested if args.text_format: print("Writing text file...") android_handler.create_txt(data, args.text_format) - + # Create JSON files if requested if args.json and not args.import_json: export_json(args, data, contact_store) @@ -591,11 +606,11 @@ def export_json(args, data: ChatCollection, contact_store=None) -> None: # Enrich from vcards if available if contact_store and not contact_store.is_empty(): contact_store.enrich_from_vcards(data) - + # Convert ChatStore objects to JSON if isinstance(data.get(next(iter(data), None)), ChatStore): data = {jik: chat.to_json() for jik, chat in data.items()} - + # Export as a single file or per chat if not args.json_per_chat: export_single_json(args, data) @@ -619,11 +634,11 @@ def export_multiple_json(args, data: Dict) -> None: """Export data to multiple JSON files, one per chat.""" # Adjust output path if needed json_path = args.json[:-5] if args.json.endswith(".json") else args.json - + # Create directory if it doesn't exist if not os.path.isdir(json_path): os.makedirs(json_path, exist_ok=True) - + # Export each chat total = len(data.keys()) for index, jik in enumerate(data.keys()): @@ -631,11 +646,11 @@ def export_multiple_json(args, data: Dict) -> None: contact = data[jik]["name"].replace('/', '') else: contact = jik.replace('+', '') - + with open(f"{json_path}/{sanitize_filename(contact)}.json", "w") as f: file_content = json.dumps( - {jik: data[jik]}, - ensure_ascii=not args.avoid_encoding_json, + {jik: data[jik]}, + 
ensure_ascii=not args.avoid_encoding_json, indent=args.pretty_print_json ) f.write(file_content) @@ -646,7 +661,7 @@ def export_multiple_json(args, data: Dict) -> None: def process_exported_chat(args, data: ChatCollection) -> None: """Process an exported chat file.""" exported_handler.messages(args.exported, data, args.assume_first_as_me) - + if not args.no_html: android_handler.create_html( data, @@ -659,7 +674,7 @@ def process_exported_chat(args, data: ChatCollection) -> None: args.whatsapp_theme, args.headline ) - + # Copy files to output directory for file in glob.glob(r'*.*'): shutil.copy(file, args.output) @@ -670,23 +685,23 @@ def main(): # Set up and parse arguments parser = setup_argument_parser() args = parser.parse_args() - + # Check for updates if args.check_update: exit(check_update()) - + # Validate arguments validate_args(parser, args) - + # Create output directory if it doesn't exist os.makedirs(args.output, exist_ok=True) - + # Initialize data collection data = ChatCollection() - + # Set up contact store for vCard enrichment if needed contact_store = setup_contact_store(args) - + if args.import_json: # Import from JSON import_from_json(args.json, data) @@ -710,13 +725,13 @@ def main(): # Set default media path if not provided if args.media is None: args.media = "WhatsApp" - + # Set default DB paths if not provided if args.db is None: args.db = "msgstore.db" if args.wa is None: args.wa = "wa.db" - + # Decrypt backup if needed if args.key is not None: error = decrypt_android_backup(args) @@ -729,24 +744,26 @@ def main(): else: from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier as identifiers args.identifiers = identifiers - + # Set default media path if not provided if args.media is None: args.media = identifiers.DOMAIN - + # Extract media from backup if needed if args.backup is not None: if not os.path.isdir(args.media): - ios_media_handler.extract_media(args.backup, identifiers, args.decrypt_chunk_size) + ios_media_handler.extract_media( 
+ args.backup, identifiers, args.decrypt_chunk_size) else: - print("WhatsApp directory already exists, skipping WhatsApp file extraction.") - + print( + "WhatsApp directory already exists, skipping WhatsApp file extraction.") + # Set default DB paths if not provided if args.db is None: args.db = identifiers.MESSAGE if args.wa is None: args.wa = "ContactsV2.sqlite" - + if args.incremental_merge: incremental_merge( args.source_dir, @@ -756,16 +773,16 @@ def main(): args.avoid_encoding_json ) print("Incremental merge completed successfully.") - else: + else: # Process contacts process_contacts(args, data, contact_store) - + # Process messages, media, and calls process_messages(args, data) - + # Create output files create_output_files(args, data, contact_store) - + # Handle media directory handle_media_directory(args) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 3dc6b24..1ebf75d 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -7,6 +7,7 @@ class Timing: """ Handles timestamp formatting with timezone support. """ + def __init__(self, timezone_offset: Optional[int]) -> None: """ Initialize Timing object. @@ -37,6 +38,7 @@ class TimeZone(tzinfo): """ Custom timezone class with fixed offset. """ + def __init__(self, offset: int) -> None: """ Initialize TimeZone object. @@ -151,6 +153,7 @@ class ChatStore: """ Stores chat information and messages. """ + def __init__(self, type: str, name: Optional[str] = None, media: Optional[str] = None) -> None: """ Initialize ChatStore object. @@ -159,7 +162,7 @@ class ChatStore: type (str): Device type (IOS or ANDROID) name (Optional[str]): Chat name media (Optional[str]): Path to media folder - + Raises: TypeError: If name is not a string or None """ @@ -182,7 +185,7 @@ class ChatStore: self.their_avatar_thumb = None self.status = None self.media_base = "" - + def __len__(self) -> int: """Get number of chats. 
Required for dict-like access.""" return len(self._messages) @@ -192,7 +195,7 @@ class ChatStore: if not isinstance(message, Message): raise TypeError("message must be a Message object") self._messages[id] = message - + def get_message(self, id: str) -> 'Message': """Get a message from the chat store.""" return self._messages.get(id) @@ -214,7 +217,7 @@ class ChatStore: 'media_base': self.media_base, 'messages': {id: msg.to_json() for id, msg in self._messages.items()} } - + @classmethod def from_json(cls, data: Dict) -> 'ChatStore': """Create a chat store from JSON data.""" @@ -232,7 +235,7 @@ class ChatStore: def get_last_message(self) -> 'Message': """Get the most recent message in the chat.""" return tuple(self._messages.values())[-1] - + def items(self): """Get message items pairs.""" return self._messages.items() @@ -250,11 +253,11 @@ class ChatStore: Args: other (ChatStore): The ChatStore to merge with - + """ if not isinstance(other, ChatStore): raise TypeError("Can only merge with another ChatStore object") - + # Update fields if they are not None in the other ChatStore self.name = other.name or self.name self.type = other.type or self.type @@ -262,14 +265,16 @@ class ChatStore: self.their_avatar = other.their_avatar or self.their_avatar self.their_avatar_thumb = other.their_avatar_thumb or self.their_avatar_thumb self.status = other.status or self.status - + # Merge messages self._messages.update(other._messages) + class Message: """ Represents a single message in a chat. 
""" + def __init__( self, *, @@ -301,7 +306,7 @@ class Message: self.from_me = bool(from_me) self.timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp timing = Timing(timezone_offset) - + if isinstance(time, (int, float)): self.time = timing.format_timestamp(self.timestamp, "%H:%M") elif isinstance(time, str): @@ -318,13 +323,15 @@ class Message: self.mime = None self.message_type = message_type if isinstance(received_timestamp, (int, float)): - self.received_timestamp = timing.format_timestamp(received_timestamp, "%Y/%m/%d %H:%M") + self.received_timestamp = timing.format_timestamp( + received_timestamp, "%Y/%m/%d %H:%M") elif isinstance(received_timestamp, str): self.received_timestamp = received_timestamp else: self.received_timestamp = None if isinstance(read_timestamp, (int, float)): - self.read_timestamp = timing.format_timestamp(read_timestamp, "%Y/%m/%d %H:%M") + self.read_timestamp = timing.format_timestamp( + read_timestamp, "%Y/%m/%d %H:%M") elif isinstance(read_timestamp, str): self.read_timestamp = read_timestamp else: @@ -363,13 +370,13 @@ class Message: @classmethod def from_json(cls, data: Dict) -> 'Message': message = cls( - from_me = data["from_me"], - timestamp = data["timestamp"], - time = data["time"], - key_id = data["key_id"], - message_type = data.get("message_type"), - received_timestamp = data.get("received_timestamp"), - read_timestamp = data.get("read_timestamp") + from_me=data["from_me"], + timestamp=data["timestamp"], + time=data["time"], + key_id=data["key_id"], + message_type=data.get("message_type"), + received_timestamp=data.get("received_timestamp"), + read_timestamp=data.get("read_timestamp") ) message.media = data.get("media") message.meta = data.get("meta") diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index cbc67ae..49b8335 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -18,6 +18,7 @@ except ImportError: # < Python 3.11 # This 
should be removed when the support for Python 3.10 ends. (31 Oct 2026) from enum import Enum + class StrEnum(str, Enum): pass @@ -72,7 +73,7 @@ def bytes_to_readable(size_bytes: int) -> str: A human-readable string representing the file size. """ if size_bytes == 0: - return "0B" + return "0B" size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) @@ -100,7 +101,7 @@ def readable_to_bytes(size_str: str) -> int: 'TB': 1024**4, 'PB': 1024**5, 'EB': 1024**6, - 'ZB': 1024**7, + 'ZB': 1024**7, 'YB': 1024**8 } size_str = size_str.upper().strip() @@ -155,7 +156,8 @@ def check_update(): else: with raw: package_info = json.load(raw) - latest_version = tuple(map(int, package_info["info"]["version"].split("."))) + latest_version = tuple( + map(int, package_info["info"]["version"].split("."))) __version__ = importlib.metadata.version("whatsapp_chat_exporter") current_version = tuple(map(int, __version__.split("."))) if current_version < latest_version: @@ -174,17 +176,17 @@ def check_update(): def rendering( - output_file_name, - template, - name, - msgs, - contact, - w3css, - chat, - headline, - next=False, - previous=False - ): + output_file_name, + template, + name, + msgs, + contact, + w3css, + chat, + headline, + next=False, + previous=False +): if chat.their_avatar_thumb is None and chat.their_avatar is not None: their_avatar_thumb = chat.their_avatar else: @@ -256,7 +258,8 @@ def import_from_json(json_file: str, data: Dict[str, ChatStore]): message.sticker = msg.get("sticker") chat.add_message(id, message) data[jid] = chat - print(f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") + print( + f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_print_json: int, avoid_encoding_json: bool): @@ -273,39 +276,44 @@ def incremental_merge(source_dir: str, target_dir: str, 
media_dir: str, pretty_p return print("JSON files found:", json_files) - + for json_file in json_files: source_path = os.path.join(source_dir, json_file) target_path = os.path.join(target_dir, json_file) - + if not os.path.exists(target_path): print(f"Copying '{json_file}' to target directory...") os.makedirs(target_dir, exist_ok=True) with open(source_path, 'rb') as src, open(target_path, 'wb') as dst: dst.write(src.read()) else: - print(f"Merging '{json_file}' with existing file in target directory...") + print( + f"Merging '{json_file}' with existing file in target directory...") with open(source_path, 'r') as src_file, open(target_path, 'r') as tgt_file: source_data = json.load(src_file) target_data = json.load(tgt_file) - + # Parse JSON into ChatStore objects using from_json() - source_chats = {jid: ChatStore.from_json(chat) for jid, chat in source_data.items()} - target_chats = {jid: ChatStore.from_json(chat) for jid, chat in target_data.items()} - + source_chats = {jid: ChatStore.from_json( + chat) for jid, chat in source_data.items()} + target_chats = {jid: ChatStore.from_json( + chat) for jid, chat in target_data.items()} + # Merge chats using merge_with() for jid, chat in source_chats.items(): if jid in target_chats: target_chats[jid].merge_with(chat) else: target_chats[jid] = chat - + # Serialize merged data - merged_data = {jid: chat.to_json() for jid, chat in target_chats.items()} - + merged_data = {jid: chat.to_json() + for jid, chat in target_chats.items()} + # Check if the merged data differs from the original target data if json.dumps(merged_data, sort_keys=True) != json.dumps(target_data, sort_keys=True): - print(f"Changes detected in '{json_file}', updating target file...") + print( + f"Changes detected in '{json_file}', updating target file...") with open(target_path, 'w') as merged_file: json.dump( merged_data, @@ -314,12 +322,14 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p ensure_ascii=not 
avoid_encoding_json, ) else: - print(f"No changes detected in '{json_file}', skipping update.") + print( + f"No changes detected in '{json_file}', skipping update.") # Merge media directories source_media_path = os.path.join(source_dir, media_dir) target_media_path = os.path.join(target_dir, media_dir) - print(f"Merging media directories. Source: {source_media_path}, target: {target_media_path}") + print( + f"Merging media directories. Source: {source_media_path}, target: {target_media_path}") if os.path.exists(source_media_path): for root, _, files in os.walk(source_media_path): relative_path = os.path.relpath(root, source_media_path) @@ -411,23 +421,29 @@ def get_chat_condition(filter: Optional[List[str]], include: bool, columns: List if filter is not None: conditions = [] if len(columns) < 2 and jid is not None: - raise ValueError("There must be at least two elements in argument columns if jid is not None") + raise ValueError( + "There must be at least two elements in argument columns if jid is not None") if jid is not None: if platform == "android": is_group = f"{jid}.type == 1" elif platform == "ios": is_group = f"{jid} IS NOT NULL" else: - raise ValueError("Only android and ios are supported for argument platform if jid is not None") + raise ValueError( + "Only android and ios are supported for argument platform if jid is not None") for index, chat in enumerate(filter): if include: - conditions.append(f"{' OR' if index > 0 else ''} {columns[0]} LIKE '%{chat}%'") + conditions.append( + f"{' OR' if index > 0 else ''} {columns[0]} LIKE '%{chat}%'") if len(columns) > 1: - conditions.append(f" OR ({columns[1]} LIKE '%{chat}%' AND {is_group})") + conditions.append( + f" OR ({columns[1]} LIKE '%{chat}%' AND {is_group})") else: - conditions.append(f"{' AND' if index > 0 else ''} {columns[0]} NOT LIKE '%{chat}%'") + conditions.append( + f"{' AND' if index > 0 else ''} {columns[0]} NOT LIKE '%{chat}%'") if len(columns) > 1: - conditions.append(f" AND ({columns[1]} NOT 
LIKE '%{chat}%' AND {is_group})") + conditions.append( + f" AND ({columns[1]} NOT LIKE '%{chat}%' AND {is_group})") return f"AND ({' '.join(conditions)})" else: return "" @@ -522,7 +538,7 @@ def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optiona else: msg = f"{old} changed their number to {new}" elif content["action_type"] == 46: - return # Voice message in PM??? Seems no need to handle. + return # Voice message in PM??? Seems no need to handle. elif content["action_type"] == 47: msg = "The contact is an official business account" elif content["action_type"] == 50: @@ -539,7 +555,8 @@ def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optiona elif content["action_type"] == 67: return # (PM) this contact use secure service from Facebook??? elif content["action_type"] == 69: - return # (PM) this contact use secure service from Facebook??? What's the difference with 67???? + # (PM) this contact use secure service from Facebook??? What's the difference with 67???? 
+ return else: return # Unsupported return msg @@ -566,7 +583,8 @@ def get_status_location(output_folder: str, offline_static: str) -> str: w3css_path = os.path.join(static_folder, "w3.css") if not os.path.isfile(w3css_path): with urllib.request.urlopen(w3css) as resp: - with open(w3css_path, "wb") as f: f.write(resp.read()) + with open(w3css_path, "wb") as f: + f.write(resp.read()) w3css = os.path.join(offline_static, "w3.css") @@ -597,6 +615,7 @@ def setup_template(template: Optional[str], no_avatar: bool, experimental: bool template_env.filters['sanitize_except'] = sanitize_except return template_env.get_template(template_file) + # iOS Specific APPLE_TIME = 978307200 @@ -617,23 +636,31 @@ def slugify(value: str, allow_unicode: bool = False) -> str: if allow_unicode: value = unicodedata.normalize('NFKC', value) else: - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = unicodedata.normalize('NFKD', value).encode( + 'ascii', 'ignore').decode('ascii') value = re.sub(r'[^\w\s-]', '', value.lower()) return re.sub(r'[-\s]+', '-', value).strip('-_') class WhatsAppIdentifier(StrEnum): - MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ChatStorage.sqlite - CONTACT = "b8548dc30aa1030df0ce18ef08b882cf7ab5212f" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ContactsV2.sqlite - CALL = "1b432994e958845fffe8e2f190f26d1511534088" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-CallHistory.sqlite + # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ChatStorage.sqlite + MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" + # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ContactsV2.sqlite + CONTACT = "b8548dc30aa1030df0ce18ef08b882cf7ab5212f" + # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-CallHistory.sqlite + CALL = "1b432994e958845fffe8e2f190f26d1511534088" DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsApp.shared" class 
WhatsAppBusinessIdentifier(StrEnum): - MESSAGE = "724bd3b98b18518b455a87c1f3ac3a0d189c4466" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ChatStorage.sqlite - CONTACT = "d7246a707f51ddf8b17ee2dddabd9e0a4da5c552" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ContactsV2.sqlite - CALL = "b463f7c4365eefc5a8723930d97928d4e907c603" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-CallHistory.sqlite - DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared" + # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ChatStorage.sqlite + MESSAGE = "724bd3b98b18518b455a87c1f3ac3a0d189c4466" + # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ContactsV2.sqlite + CONTACT = "d7246a707f51ddf8b17ee2dddabd9e0a4da5c552" + # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-CallHistory.sqlite + CALL = "b463f7c4365eefc5a8723930d97928d4e907c603" + DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared" + class JidType(IntEnum): PM = 0 diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py index abd1dc2..5f96afd 100644 --- a/tests/test_incremental_merge.py +++ b/tests/test_incremental_merge.py @@ -209,24 +209,24 @@ def test_incremental_merge_existing_file_with_changes(mock_filesystem): source_dir = "/source" target_dir = "/target" media_dir = "media" - + # Setup mock filesystem mock_filesystem["exists"].side_effect = lambda x: True mock_filesystem["listdir"].return_value = ["chat.json"] - + # Mock file operations mock_file_content = { "/source/chat.json": json.dumps(chat_data_2), "/target/chat.json": json.dumps(chat_data_1), } - + written_chunks = [] - + def mock_file_write(data): written_chunks.append(data) - + mock_write = MagicMock(side_effect=mock_file_write) - + with patch("builtins.open", mock_open()) as mock_file: def mock_file_read(filename, mode="r"): content = mock_file_content.get(filename) @@ -234,27 +234,27 @@ def test_incremental_merge_existing_file_with_changes(mock_filesystem): if mode == 'w': 
file_mock.write.side_effect = mock_write return file_mock - + mock_file.side_effect = mock_file_read - + # Run the function incremental_merge(source_dir, target_dir, media_dir, 2, True) - + # Verify file operations - both files opened in text mode when target exists mock_file.assert_any_call("/source/chat.json", "r") mock_file.assert_any_call("/target/chat.json", "r") mock_file.assert_any_call("/target/chat.json", "w") - + # Verify write was called assert mock_write.called, "Write method was never called" - + # Combine chunks and parse JSON written_data = json.loads(''.join(written_chunks)) - + # Verify the merged data is correct assert written_data is not None, "No data was written" assert written_data == chat_data_merged, "Merged data does not match expected result" - + # Verify specific message retention messages = written_data["12345678@s.whatsapp.net"]["messages"] assert "24690" in messages, "Common message should be present" @@ -292,7 +292,8 @@ def test_incremental_merge_existing_file_no_changes(mock_filesystem): incremental_merge(source_dir, target_dir, media_dir, 2, True) # Verify no write operations occurred on target file - write_calls = [call for call in mock_file.mock_calls if call[0] == "().write"] + write_calls = [ + call for call in mock_file.mock_calls if call[0] == "().write"] assert len(write_calls) == 0 @@ -333,4 +334,5 @@ def test_incremental_merge_media_copy(mock_filesystem): assert ( mock_filesystem["makedirs"].call_count >= 2 ) # At least target dir and media dir - assert mock_filesystem["copy2"].call_count == 2 # Two media files copied + # Two media files copied + assert mock_filesystem["copy2"].call_count == 2 From dad7666adb68feb3a29091a157498828ebe3e030 Mon Sep 17 00:00:00 2001 From: fschuh Date: Mon, 5 May 2025 12:32:29 -0400 Subject: [PATCH 027/108] Updated to also use shutil for JSON file copying --- Whatsapp_Chat_Exporter/utility.py | 3 +-- tests/test_incremental_merge.py | 33 +++++++++++-------------------- 2 files changed, 12 
insertions(+), 24 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 49b8335..b6f7bc2 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -284,8 +284,7 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p if not os.path.exists(target_path): print(f"Copying '{json_file}' to target directory...") os.makedirs(target_dir, exist_ok=True) - with open(source_path, 'rb') as src, open(target_path, 'wb') as dst: - dst.write(src.read()) + shutil.copy2(source_path, target_path) else: print( f"Merging '{json_file}' with existing file in target directory...") diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py index 5f96afd..34f319e 100644 --- a/tests/test_incremental_merge.py +++ b/tests/test_incremental_merge.py @@ -177,31 +177,20 @@ def test_incremental_merge_new_file(mock_filesystem): source_dir = "/source" target_dir = "/target" media_dir = "media" - + # Setup mock filesystem mock_filesystem["exists"].side_effect = lambda x: x == "/source" mock_filesystem["listdir"].return_value = ["chat.json"] - - # Mock file operations - mock_file_content = { - "/source/chat.json": json.dumps(chat_data_1), - } - - with patch("builtins.open", mock_open()) as mock_file: - - def mock_file_read(filename, mode="r"): - content = mock_file_content.get(filename) - file_mock = mock_open(read_data=content).return_value - return file_mock - - mock_file.side_effect = mock_file_read - - # Run the function - incremental_merge(source_dir, target_dir, media_dir, 2, True) - - # Verify the operations - mock_file.assert_any_call("/source/chat.json", "rb") - mock_file.assert_any_call("/target/chat.json", "wb") + + # Run the function + incremental_merge(source_dir, target_dir, media_dir, 2, True) + + # Verify the operations + mock_filesystem["makedirs"].assert_called_once_with(target_dir, exist_ok=True) + mock_filesystem["copy2"].assert_called_once_with( + 
os.path.join(source_dir, "chat.json"), + os.path.join(target_dir, "chat.json") + ) def test_incremental_merge_existing_file_with_changes(mock_filesystem): From db42ad123da0b7ad33d78835fabf84bfcb97ee0b Mon Sep 17 00:00:00 2001 From: fschuh Date: Mon, 5 May 2025 15:53:13 -0400 Subject: [PATCH 028/108] Fixed unit tests so they no longer fail on Windows --- tests/test_incremental_merge.py | 112 ++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 49 deletions(-) diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py index 34f319e..39f6c19 100644 --- a/tests/test_incremental_merge.py +++ b/tests/test_incremental_merge.py @@ -6,12 +6,13 @@ from Whatsapp_Chat_Exporter.utility import incremental_merge from Whatsapp_Chat_Exporter.data_model import ChatStore # Test data setup +BASE_PATH = "AppDomainGroup-group.net.whatsapp.WhatsApp.shared" chat_data_1 = { "12345678@s.whatsapp.net": { "name": "Friend", "type": "ios", - "my_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared\\Media/Profile/Photo.jpg", - "their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", + "my_avatar": os.path.join(BASE_PATH, "Media", "Profile", "Photo.jpg"), + "their_avatar": os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), "their_avatar_thumb": None, "status": None, "messages": { @@ -45,8 +46,8 @@ chat_data_2 = { "12345678@s.whatsapp.net": { "name": "Friend", "type": "ios", - "my_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared\\Media/Profile/Photo.jpg", - "their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", + "my_avatar": os.path.join(BASE_PATH, "Media", "Profile", "Photo.jpg"), + "their_avatar": os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), "their_avatar_thumb": None, "status": None, "messages": { @@ -81,8 +82,8 @@ chat_data_merged = { "12345678@s.whatsapp.net": { "name": "Friend", 
"type": "ios", - "my_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared\\Media/Profile/Photo.jpg", - "their_avatar": "AppDomainGroup-group.net.whatsapp.WhatsApp.shared/Media/Profile\\12345678-1709851420.thumb", + "my_avatar": os.path.join(BASE_PATH, "Media", "Profile", "Photo.jpg"), + "their_avatar": os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), "their_avatar_thumb": None, "status": None, "media_base": None, @@ -195,18 +196,20 @@ def test_incremental_merge_new_file(mock_filesystem): def test_incremental_merge_existing_file_with_changes(mock_filesystem): """Test merging when target file exists and has changes""" - source_dir = "/source" - target_dir = "/target" + source_dir = "source" + target_dir = "target" media_dir = "media" # Setup mock filesystem mock_filesystem["exists"].side_effect = lambda x: True mock_filesystem["listdir"].return_value = ["chat.json"] - # Mock file operations + # Mock file operations with consistent path separators + source_file = os.path.join(source_dir, "chat.json") + target_file = os.path.join(target_dir, "chat.json") mock_file_content = { - "/source/chat.json": json.dumps(chat_data_2), - "/target/chat.json": json.dumps(chat_data_1), + source_file: json.dumps(chat_data_2), + target_file: json.dumps(chat_data_1), } written_chunks = [] @@ -218,33 +221,33 @@ def test_incremental_merge_existing_file_with_changes(mock_filesystem): with patch("builtins.open", mock_open()) as mock_file: def mock_file_read(filename, mode="r"): - content = mock_file_content.get(filename) - file_mock = mock_open(read_data=content).return_value if mode == 'w': + file_mock = mock_open().return_value file_mock.write.side_effect = mock_write - return file_mock + return file_mock + else: + # Use normalized path for lookup + norm_filename = os.path.normpath(filename) + content = mock_file_content.get(norm_filename, '') + file_mock = mock_open(read_data=content).return_value + return file_mock mock_file.side_effect = mock_file_read # 
Run the function incremental_merge(source_dir, target_dir, media_dir, 2, True) - # Verify file operations - both files opened in text mode when target exists - mock_file.assert_any_call("/source/chat.json", "r") - mock_file.assert_any_call("/target/chat.json", "r") - mock_file.assert_any_call("/target/chat.json", "w") + # Verify file operations using os.path.join + mock_file.assert_any_call(source_file, "r") + mock_file.assert_any_call(target_file, "r") + mock_file.assert_any_call(target_file, "w") - # Verify write was called + # Rest of verification code... assert mock_write.called, "Write method was never called" - - # Combine chunks and parse JSON written_data = json.loads(''.join(written_chunks)) - - # Verify the merged data is correct assert written_data is not None, "No data was written" assert written_data == chat_data_merged, "Merged data does not match expected result" - # Verify specific message retention messages = written_data["12345678@s.whatsapp.net"]["messages"] assert "24690" in messages, "Common message should be present" assert "24691" in messages, "Target-only message should be preserved" @@ -254,26 +257,33 @@ def test_incremental_merge_existing_file_with_changes(mock_filesystem): def test_incremental_merge_existing_file_no_changes(mock_filesystem): """Test merging when target file exists but has no changes""" - source_dir = "/source" - target_dir = "/target" + source_dir = "source" + target_dir = "target" media_dir = "media" # Setup mock filesystem mock_filesystem["exists"].side_effect = lambda x: True mock_filesystem["listdir"].return_value = ["chat.json"] - # Mock file operations + # Mock file operations with consistent path separators + source_file = os.path.join(source_dir, "chat.json") + target_file = os.path.join(target_dir, "chat.json") mock_file_content = { - "/source/chat.json": json.dumps(chat_data_1), - "/target/chat.json": json.dumps(chat_data_1), + source_file: json.dumps(chat_data_1), + target_file: json.dumps(chat_data_1), } with 
patch("builtins.open", mock_open()) as mock_file: - def mock_file_read(filename, mode="r"): - content = mock_file_content.get(filename) - file_mock = mock_open(read_data=content).return_value - return file_mock + if mode == 'w': + file_mock = mock_open().return_value + return file_mock + else: + # Use normalized path for lookup + norm_filename = os.path.normpath(filename) + content = mock_file_content.get(norm_filename, '') + file_mock = mock_open(read_data=content).return_value + return file_mock mock_file.side_effect = mock_file_read @@ -288,31 +298,38 @@ def test_incremental_merge_existing_file_no_changes(mock_filesystem): def test_incremental_merge_media_copy(mock_filesystem): """Test media file copying during merge""" - source_dir = "/source" - target_dir = "/target" + source_dir = "source" + target_dir = "target" media_dir = "media" # Setup mock filesystem mock_filesystem["exists"].side_effect = lambda x: True mock_filesystem["listdir"].return_value = ["chat.json"] mock_filesystem["walk"].return_value = [ - ("/source/media", ["subfolder"], ["file1.jpg"]), - ("/source/media/subfolder", [], ["file2.jpg"]), + (os.path.join(source_dir, "media"), ["subfolder"], ["file1.jpg"]), + (os.path.join(source_dir, "media", "subfolder"), [], ["file2.jpg"]), ] mock_filesystem["getmtime"].side_effect = lambda x: 1000 if "source" in x else 500 - # Mock file operations + # Mock file operations with consistent path separators + source_file = os.path.join(source_dir, "chat.json") + target_file = os.path.join(target_dir, "chat.json") mock_file_content = { - "/source/chat.json": json.dumps(chat_data_1), - "/target/chat.json": json.dumps(chat_data_1), + source_file: json.dumps(chat_data_1), + target_file: json.dumps(chat_data_1), } with patch("builtins.open", mock_open()) as mock_file: - def mock_file_read(filename, mode="r"): - content = mock_file_content.get(filename) - file_mock = mock_open(read_data=content).return_value - return file_mock + if mode == 'w': + file_mock = 
mock_open().return_value + return file_mock + else: + # Use normalized path for lookup + norm_filename = os.path.normpath(filename) + content = mock_file_content.get(norm_filename, '') + file_mock = mock_open(read_data=content).return_value + return file_mock mock_file.side_effect = mock_file_read @@ -320,8 +337,5 @@ def test_incremental_merge_media_copy(mock_filesystem): incremental_merge(source_dir, target_dir, media_dir, 2, True) # Verify media file operations - assert ( - mock_filesystem["makedirs"].call_count >= 2 - ) # At least target dir and media dir - # Two media files copied - assert mock_filesystem["copy2"].call_count == 2 + assert mock_filesystem["makedirs"].call_count >= 2 # At least target dir and media dir + assert mock_filesystem["copy2"].call_count == 2 # Two media files copied From 170a108109e75a2d0d32cd4dc791b12385ca1912 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 21:31:03 +0800 Subject: [PATCH 029/108] Bug fix on incorrectly normalized number that starts with 0 --- Whatsapp_Chat_Exporter/vcards_contacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/vcards_contacts.py b/Whatsapp_Chat_Exporter/vcards_contacts.py index ea38371..d2d566b 100644 --- a/Whatsapp_Chat_Exporter/vcards_contacts.py +++ b/Whatsapp_Chat_Exporter/vcards_contacts.py @@ -77,6 +77,6 @@ def normalize_number(number: str, country_code: str): return number[len(starting_char):] # leading zero should be removed - if starting_char == '0': + if number.startswith('0'): number = number[1:] return country_code + number # fall back From e2b773eac5a7a2344df54118fefaaa428f70d925 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 21:31:29 +0800 Subject: [PATCH 030/108] Move all tests to single directory --- tests/__init__.py | 0 tests/data/contacts.vcf | 5 +++++ .../test_brazilian_number_processing.py | 4 ++-- 
.../vcards_contacts_test.py => tests/test_vcards_contacts.py | 4 +++- 4 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/data/contacts.vcf rename scripts/brazilian_number_processing_test.py => tests/test_brazilian_number_processing.py (98%) rename Whatsapp_Chat_Exporter/vcards_contacts_test.py => tests/test_vcards_contacts.py (79%) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/contacts.vcf b/tests/data/contacts.vcf new file mode 100644 index 0000000..12a026b --- /dev/null +++ b/tests/data/contacts.vcf @@ -0,0 +1,5 @@ +BEGIN:VCARD +VERSION:3.0 +FN:Sample Contact +TEL;TYPE=CELL:+85288888888 +END:VCARD \ No newline at end of file diff --git a/scripts/brazilian_number_processing_test.py b/tests/test_brazilian_number_processing.py similarity index 98% rename from scripts/brazilian_number_processing_test.py rename to tests/test_brazilian_number_processing.py index 8c6a38b..f7ccd27 100644 --- a/scripts/brazilian_number_processing_test.py +++ b/tests/test_brazilian_number_processing.py @@ -4,7 +4,7 @@ import tempfile import os from unittest.mock import patch -from brazilian_number_processing import process_phone_number, process_vcard +from scripts.brazilian_number_processing import process_phone_number, process_vcard class TestVCardProcessor(unittest.TestCase): @@ -248,7 +248,7 @@ END:VCARD output_path = input_path + '.out' try: - test_args = ['python' if os.name == 'nt' else 'python3', 'brazilian_number_processing.py', input_path, output_path] + test_args = ['python' if os.name == 'nt' else 'python3', 'scripts/brazilian_number_processing.py', input_path, output_path] # We're just testing that the argument parsing works subprocess.call( test_args, diff --git a/Whatsapp_Chat_Exporter/vcards_contacts_test.py b/tests/test_vcards_contacts.py similarity index 79% rename from Whatsapp_Chat_Exporter/vcards_contacts_test.py rename to 
tests/test_vcards_contacts.py index 194b637..5d22b56 100644 --- a/Whatsapp_Chat_Exporter/vcards_contacts_test.py +++ b/tests/test_vcards_contacts.py @@ -1,10 +1,12 @@ # from contacts_names_from_vcards import readVCardsFile +import os from Whatsapp_Chat_Exporter.vcards_contacts import normalize_number, read_vcards_file def test_readVCardsFile(): - assert len(read_vcards_file("contacts.vcf", "973")) > 0 + data_dir = os.path.join(os.path.dirname(__file__), "data") + assert len(read_vcards_file(os.path.join(data_dir, "contacts.vcf"), "852")) > 0 def test_create_number_to_name_dicts(): pass From a3294ead11105d9397aeab8fad0b0dae2e87e8f3 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 21:45:45 +0800 Subject: [PATCH 031/108] Add a basic sanity check for the exporter The check make sure all modules can be imported and the exporter can at least run without any arguments provided. --- tests/test_exporter.py | 50 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 tests/test_exporter.py diff --git a/tests/test_exporter.py b/tests/test_exporter.py new file mode 100644 index 0000000..2a38b26 --- /dev/null +++ b/tests/test_exporter.py @@ -0,0 +1,50 @@ +import subprocess +import pytest + + +@pytest.fixture +def command_runner(): + """ + A pytest fixture to simplify running commands. This is a helper + function that you can use in multiple tests. + """ + def _run_command(command_list, check=True): + """ + Runs a command and returns the result. + + Args: + command_list (list): A list of strings representing the command + and its arguments (e.g., ["python", "my_script.py", "arg1"]). + check (bool, optional): If True, raise an exception if the + command returns a non-zero exit code. Defaults to True. + + Returns: + subprocess.CompletedProcess: The result of the command. 
+ """ + return subprocess.run( + command_list, + capture_output=True, + text=True, + check=check, + ) + return _run_command + + +def test_sanity_check(command_runner): + """ + This is a basic sanity check to make sure all modules can be imported + This runs the exporter without any arguments. It should fail with a + message about missing arguments. + """ + result = command_runner(["wtsexporter"], False) + expected_stderr = "You must define the device type" + assert expected_stderr in result.stderr, f"STDERR was: {result.stderr}" + assert result.returncode == 2 + + +def test_android(command_runner): + ... + + +def test_ios(command_runner): + ... From 0bb4f52a26af485935b7db9b67bcdb424526d9b3 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 21:46:19 +0800 Subject: [PATCH 032/108] Add CI --- .github/workflows/ci.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..04434a2 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: Run Pytest on Dev Branch Push + +on: + push: + branches: + - dev + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.9, 3.10, 3.11, 3.12, 3.13] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[all] pytest + + - name: Run pytest + run: pytest # Execute your pytest tests From 3e37bbb021128eacb51aba6e1fb1c42f20cb684c Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 22:28:48 +0800 Subject: [PATCH 033/108] Create test_nuitka_binary.py --- tests/test_nuitka_binary.py | 74 
+++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tests/test_nuitka_binary.py diff --git a/tests/test_nuitka_binary.py b/tests/test_nuitka_binary.py new file mode 100644 index 0000000..45c4a12 --- /dev/null +++ b/tests/test_nuitka_binary.py @@ -0,0 +1,74 @@ +import os +import pytest +import subprocess + + +@pytest.fixture +def command_runner(): + """ + A pytest fixture to simplify running commands. This is a helper + function that you can use in multiple tests. + """ + def _run_command(command_list, check=True): + """ + Runs a command and returns the result. + + Args: + command_list (list): A list of strings representing the command + and its arguments (e.g., ["python", "my_script.py", "arg1"]). + check (bool, optional): If True, raise an exception if the + command returns a non-zero exit code. Defaults to True. + + Returns: + subprocess.CompletedProcess: The result of the command. + """ + return subprocess.run( + command_list, + capture_output=True, + text=True, + check=check, + ) + return _run_command + + +def test_nuitka_binary(): + """ + Tests the creation and execution of a Nuitka-compiled binary. + """ + nuitka_command = [ + "python", "-m", "nuitka", + "--no-deployment-flag=self-execution", + "--onefile", + "--include-data-file=./Whatsapp_Chat_Exporter/whatsapp.html=./Whatsapp_Chat_Exporter/whatsapp.html", + "--assume-yes-for-downloads", + "--follow-imports", + "Whatsapp_Chat_Exporter/__main__.py", + "--output-filename=wtsexporter.exe" # use .exe on all platforms for compatibility + ] + + compile_result = subprocess.run( + nuitka_command, + capture_output=True, + text=True, + check=True + ) + print(f"Nuitka compilation output: {compile_result.stdout}") + + binary_path = "wtsexporter.exe" + assert os.path.exists(binary_path), f"Binary {binary_path} was not created." 
+ + try: + execute_result = subprocess.run( + [binary_path, "--help"], + capture_output=True, + text=True, + check=True, + ) + print(f"Binary execution output: {execute_result.stdout}") + assert "usage:" in execute_result.stdout.lower(), "Binary did not produce expected help output." + except subprocess.CalledProcessError as e: + print(f"Binary execution failed with error: {e.stderr}") + raise + finally: + if os.path.exists(binary_path): + os.remove(binary_path) From c346199d0536c90fe25d34ed9556ca4dc765882b Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 22:30:04 +0800 Subject: [PATCH 034/108] Fix python versions in ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 04434a2..033760b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.9, 3.10, 3.11, 3.12, 3.13] + python-version: [3.9, "3.10", "3.11", "3.12", "3.13"] steps: - name: Checkout code From 520f31651cbe8afd6b223f9f432f479991320a6d Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 22:31:11 +0800 Subject: [PATCH 035/108] Forgot to install nuitka --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 033760b..017157e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install .[all] pytest + pip install .[all] pytest nuitka - name: Run pytest run: pytest # Execute your pytest tests From 904f44dc12ad217de9ff8773e82d5a2e8b315966 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 22:40:28 +0800 Subject: [PATCH 036/108] Update 
test_nuitka_binary.py --- tests/test_nuitka_binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_nuitka_binary.py b/tests/test_nuitka_binary.py index 45c4a12..0e91930 100644 --- a/tests/test_nuitka_binary.py +++ b/tests/test_nuitka_binary.py @@ -54,7 +54,7 @@ def test_nuitka_binary(): ) print(f"Nuitka compilation output: {compile_result.stdout}") - binary_path = "wtsexporter.exe" + binary_path = "./wtsexporter.exe" assert os.path.exists(binary_path), f"Binary {binary_path} was not created." try: From 80cb868beb882a043386f9a3c10a7cf517164a0d Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 22:49:06 +0800 Subject: [PATCH 037/108] Expend all tests to all common systems --- .github/workflows/ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 017157e..116341a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,16 +7,17 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: matrix: + os: [ubuntu-latest, windows-latest, macos-latest] python-version: [3.9, "3.10", "3.11", "3.12", "3.13"] steps: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -27,4 +28,4 @@ jobs: pip install .[all] pytest nuitka - name: Run pytest - run: pytest # Execute your pytest tests + run: pytest From 907fe4aa91c481991fdeb156c2d2728c968b600d Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Wed, 7 May 2025 22:56:52 +0800 Subject: [PATCH 038/108] Update ci.yml --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 116341a..9b9d528 
100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,9 +6,10 @@ on: - dev jobs: - build: + ci: runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] python-version: [3.9, "3.10", "3.11", "3.12", "3.13"] From 0681661660ef242b343eacc129d3e2d98665dc92 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Thu, 8 May 2025 00:51:09 +0800 Subject: [PATCH 039/108] Update bruteforce_crypt15.py --- scripts/bruteforce_crypt15.py | 40 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/scripts/bruteforce_crypt15.py b/scripts/bruteforce_crypt15.py index 852176c..c8646d4 100644 --- a/scripts/bruteforce_crypt15.py +++ b/scripts/bruteforce_crypt15.py @@ -26,24 +26,24 @@ def _extract_encrypted_key(keyfile): return _generate_hmac_of_hmac(key_stream) +if __name__ == "__main__": + key = open("encrypted_backup.key", "rb").read() + database = open("wa.db.crypt15", "rb").read() + main_key, hex_key = _extract_encrypted_key(key) + for i in range(100): + iv = database[i:i+16] + for j in range(100): + cipher = AES.new(main_key, AES.MODE_GCM, iv) + db_ciphertext = database[j:] + db_compressed = cipher.decrypt(db_ciphertext) + try: + db = zlib.decompress(db_compressed) + except zlib.error: + ... + else: + if db[0:6] == b"SQLite": + print(f"Found!\nIV: {i}\nOffset: {j}") + print(db_compressed[:10]) + exit() -key = open("encrypted_backup.key", "rb").read() -database = open("wa.db.crypt15", "rb").read() -main_key, hex_key = _extract_encrypted_key(key) -for i in range(100): - iv = database[i:i+16] - for j in range(100): - cipher = AES.new(main_key, AES.MODE_GCM, iv) - db_ciphertext = database[j:] - db_compressed = cipher.decrypt(db_ciphertext) - try: - db = zlib.decompress(db_compressed) - except zlib.error: - ... 
- else: - if db[0:6] == b"SQLite": - print(f"Found!\nIV: {i}\nOffset: {j}") - print(db_compressed[:10]) - exit() - -print("Not found! Try to increase maximum search.") + print("Not found! Try to increase maximum search.") From fa4157275385c91d76b0f3af27ea0751597d61b0 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 16:53:46 +0800 Subject: [PATCH 040/108] Change print to logger for better logging in the future This commit also added --debug and --no-banner options, which will enable debug level of logging and supress the default banner --- Whatsapp_Chat_Exporter/__main__.py | 113 +++++++++++++++----- Whatsapp_Chat_Exporter/android_crypt.py | 50 +++++++-- Whatsapp_Chat_Exporter/android_handler.py | 38 ++++--- Whatsapp_Chat_Exporter/exported_handler.py | 10 +- Whatsapp_Chat_Exporter/ios_handler.py | 34 +++--- Whatsapp_Chat_Exporter/ios_media_handler.py | 45 ++++---- Whatsapp_Chat_Exporter/utility.py | 46 ++++---- 7 files changed, 226 insertions(+), 110 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 51bbabf..9279812 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -7,11 +7,12 @@ import shutil import json import string import glob +import logging import importlib.metadata from Whatsapp_Chat_Exporter import android_crypt, exported_handler, android_handler from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, check_update, DbType +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update, DbType from Whatsapp_Chat_Exporter.utility import readable_to_bytes, sanitize_filename from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, bytes_to_readable from argparse import ArgumentParser, SUPPRESS @@ -30,16 +31,43 @@ else: 
vcards_deps_installed = True +logger = logging.getLogger(__name__) +__version__ = importlib.metadata.version("whatsapp_chat_exporter") +WTSEXPORTER_BANNER = f"""======================================================================================================== + ██╗ ██╗██╗ ██╗ █████╗ ████████╗███████╗ █████╗ ██████╗ ██████╗ + ██║ ██║██║ ██║██╔══██╗╚══██╔══╝██╔════╝██╔══██╗██╔══██╗██╔══██╗ + ██║ █╗ ██║███████║███████║ ██║ ███████╗███████║██████╔╝██████╔╝ + ██║███╗██║██╔══██║██╔══██║ ██║ ╚════██║██╔══██║██╔═══╝ ██╔═══╝ + ╚███╔███╔╝██║ ██║██║ ██║ ██║ ███████║██║ ██║██║ ██║ + ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝ + + ██████╗██╗ ██╗ █████╗ ████████╗ ███████╗██╗ ██╗██████╗ ██████╗ ██████╗ ████████╗███████╗██████╗ +██╔════╝██║ ██║██╔══██╗╚══██╔══╝ ██╔════╝╚██╗██╔╝██╔══██╗██╔═══██╗██╔══██╗╚══██╔══╝██╔════╝██╔══██╗ +██║ ███████║███████║ ██║ █████╗ ╚███╔╝ ██████╔╝██║ ██║██████╔╝ ██║ █████╗ ██████╔╝ +██║ ██╔══██║██╔══██║ ██║ ██╔══╝ ██╔██╗ ██╔═══╝ ██║ ██║██╔══██╗ ██║ ██╔══╝ ██╔══██╗ +╚██████╗██║ ██║██║ ██║ ██║ ███████╗██╔╝ ██╗██║ ╚██████╔╝██║ ██║ ██║ ███████╗██║ ██║ + ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ + + WhatsApp Chat Exporter: A customizable Android and iOS/iPadOS WhatsApp database parser + Version: {__version__} +========================================================================================================""" + + def setup_argument_parser() -> ArgumentParser: """Set up and return the argument parser with all options.""" parser = ArgumentParser( description='A customizable Android and iOS/iPadOS WhatsApp database parser that ' 'will give you the history of your WhatsApp conversations in HTML ' 'and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported.', - epilog=f'WhatsApp Chat Exporter: {importlib.metadata.version("whatsapp_chat_exporter")} Licensed with MIT. See ' + epilog=f'WhatsApp Chat Exporter: {__version__} Licensed with MIT. 
See ' 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' ) + # General options + parser.add_argument( + "--debug", dest="debug", default=False, action='store_true', + help="Enable debug mode" + ) # Device type arguments device_group = parser.add_argument_group('Device Type') device_group.add_argument( @@ -260,6 +288,10 @@ def setup_argument_parser() -> ArgumentParser: "--max-bruteforce-worker", dest="max_bruteforce_worker", default=10, type=int, help="Specify the maximum number of worker for bruteforce decryption." ) + misc_group.add_argument( + "--no-banner", dest="no_banner", default=False, action='store_true', + help="Do not show the banner" + ) return parser @@ -391,10 +423,10 @@ def setup_contact_store(args) -> Optional['ContactsFromVCards']: """Set up and return a contact store if needed.""" if args.enrich_from_vcards is not None: if not vcards_deps_installed: - print( + logger.error( "You don't have the dependency to enrich contacts with vCard.\n" "Read more on how to deal with enriching contacts:\n" - "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" + "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage\n" ) exit(1) contact_store = ContactsFromVCards() @@ -407,10 +439,10 @@ def setup_contact_store(args) -> Optional['ContactsFromVCards']: def decrypt_android_backup(args) -> int: """Decrypt Android backup files and return error code.""" if args.key is None or args.backup is None: - print("You must specify the backup file with -b and a key with -k") + logger.error(f"You must specify the backup file with -b and a key with -k{CLEAR_LINE}") return 1 - print("Decryption key specified, decrypting WhatsApp backup...") + logger.info(f"Decryption key specified, decrypting WhatsApp backup...{CLEAR_LINE}") # Determine crypt type if "crypt12" in args.backup: @@ -420,7 +452,7 @@ def decrypt_android_backup(args) -> int: elif "crypt15" in args.backup: crypt = Crypt.CRYPT15 else: - print("Unknown backup 
format. The backup file must be crypt12, crypt14 or crypt15.") + logger.error(f"Unknown backup format. The backup file must be crypt12, crypt14 or crypt15.{CLEAR_LINE}") return 1 # Get key @@ -472,15 +504,15 @@ def decrypt_android_backup(args) -> int: def handle_decrypt_error(error: int) -> None: """Handle decryption errors with appropriate messages.""" if error == 1: - print("Dependencies of decrypt_backup and/or extract_encrypted_key" - " are not present. For details, see README.md.") + logger.error("Dependencies of decrypt_backup and/or extract_encrypted_key" + " are not present. For details, see README.md.\n") exit(3) elif error == 2: - print("Failed when decompressing the decrypted backup. " - "Possibly incorrect offsets used in decryption.") + logger.error("Failed when decompressing the decrypted backup. " + "Possibly incorrect offsets used in decryption.\n") exit(4) else: - print("Unknown error occurred.", error) + logger.error("Unknown error occurred.\n") exit(5) @@ -502,9 +534,9 @@ def process_messages(args, data: ChatCollection) -> None: msg_db = args.db if args.db else "msgstore.db" if args.android else args.identifiers.MESSAGE if not os.path.isfile(msg_db): - print( + logger.error( "The message database does not exist. You may specify the path " - "to database file with option -d or check your provided path." + "to database file with option -d or check your provided path.\n" ) exit(6) @@ -556,19 +588,21 @@ def handle_media_directory(args) -> None: media_path = os.path.join(args.output, args.media) if os.path.isdir(media_path): - print( - "\nWhatsApp directory already exists in output directory. Skipping...", end="\n") + logger.info( + f"WhatsApp directory already exists in output directory. 
Skipping...{CLEAR_LINE}") else: if args.move_media: try: - print("\nMoving media directory...", end="\n") + logger.info(f"Moving media directory...\r") shutil.move(args.media, f"{args.output}/") + logger.info(f"Media directory has been moved to the output directory{CLEAR_LINE}") except PermissionError: - print("\nCannot remove original WhatsApp directory. " - "Perhaps the directory is opened?", end="\n") + logger.warning("Cannot remove original WhatsApp directory. " + "Perhaps the directory is opened?\n") else: - print("\nCopying media directory...", end="\n") + logger.info(f"Copying media directory...\r") shutil.copytree(args.media, media_path) + logger.info(f"Media directory has been copied to the output directory{CLEAR_LINE}") def create_output_files(args, data: ChatCollection, contact_store=None) -> None: @@ -593,7 +627,7 @@ def create_output_files(args, data: ChatCollection, contact_store=None) -> None: # Create text files if requested if args.text_format: - print("Writing text file...") + logger.info(f"Writing text file...{CLEAR_LINE}") android_handler.create_txt(data, args.text_format) # Create JSON files if requested @@ -626,8 +660,9 @@ def export_single_json(args, data: Dict) -> None: ensure_ascii=not args.avoid_encoding_json, indent=args.pretty_print_json ) - print(f"\nWriting JSON file...({bytes_to_readable(len(json_data))})") + logger.info(f"Writing JSON file...\r") f.write(json_data) + logger.info(f"JSON file saved...({bytes_to_readable(len(json_data))}){CLEAR_LINE}") def export_multiple_json(args, data: Dict) -> None: @@ -654,8 +689,7 @@ def export_multiple_json(args, data: Dict) -> None: indent=args.pretty_print_json ) f.write(file_content) - print(f"Writing JSON file...({index + 1}/{total})", end="\r") - print() + logger.info(f"Writing JSON file...({index + 1}/{total})\r") def process_exported_chat(args, data: ChatCollection) -> None: @@ -680,6 +714,19 @@ def process_exported_chat(args, data: ChatCollection) -> None: shutil.copy(file, args.output) 
+def setup_logging(level): + log_handler_stdout = logging.StreamHandler() + log_handler_stdout.terminator = "" + handlers = [log_handler_stdout] + if level == logging.DEBUG: + handlers.append(logging.FileHandler("debug.log", mode="w")) + logging.basicConfig( + level=level, + format="[%(levelname)s] %(message)s", + handlers=handlers + ) + + def main(): """Main function to run the WhatsApp Chat Exporter.""" # Set up and parse arguments @@ -693,6 +740,16 @@ def main(): # Validate arguments validate_args(parser, args) + # Print banner if not suppressed + if not args.no_banner: + print(WTSEXPORTER_BANNER) + + if args.debug: + setup_logging(logging.DEBUG) + logger.debug("Debug mode enabled.\n") + else: + setup_logging(logging.INFO) + # Create output directory if it doesn't exist os.makedirs(args.output, exist_ok=True) @@ -755,8 +812,8 @@ def main(): ios_media_handler.extract_media( args.backup, identifiers, args.decrypt_chunk_size) else: - print( - "WhatsApp directory already exists, skipping WhatsApp file extraction.") + logger.info( + f"WhatsApp directory already exists, skipping WhatsApp file extraction.{CLEAR_LINE}") # Set default DB paths if not provided if args.db is None: @@ -772,7 +829,7 @@ def main(): args.pretty_print_json, args.avoid_encoding_json ) - print("Incremental merge completed successfully.") + logger.info(f"Incremental merge completed successfully.{CLEAR_LINE}") else: # Process contacts process_contacts(args, data, contact_store) @@ -786,7 +843,7 @@ def main(): # Handle media directory handle_media_directory(args) - print("Everything is done!") + logger.info("Everything is done!") if __name__ == "__main__": diff --git a/Whatsapp_Chat_Exporter/android_crypt.py b/Whatsapp_Chat_Exporter/android_crypt.py index 84e629e..68f0120 100644 --- a/Whatsapp_Chat_Exporter/android_crypt.py +++ b/Whatsapp_Chat_Exporter/android_crypt.py @@ -1,11 +1,14 @@ +import time import hmac import io +import logging +import threading import zlib import concurrent.futures from 
typing import Tuple, Union from hashlib import sha256 from sys import exit -from Whatsapp_Chat_Exporter.utility import CRYPT14_OFFSETS, Crypt, DbType +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CRYPT14_OFFSETS, Crypt, DbType try: import zlib @@ -23,6 +26,9 @@ else: support_crypt15 = True +logger = logging.getLogger(__name__) + + class DecryptionError(Exception): """Base class for decryption-related exceptions.""" pass @@ -138,11 +144,28 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> iv = database[offsets["iv"]:offsets["iv"] + 16] db_ciphertext = database[offsets["db"]:] try: - return _decrypt_database(db_ciphertext, main_key, iv) + decrypted_db = _decrypt_database(db_ciphertext, main_key, iv) except (zlib.error, ValueError): pass # Try next offset + else: + logger.debug( + f"Decryption successful with known offsets: IV {offsets["iv"]}, DB {offsets["db"]}{CLEAR_LINE}" + ) + return decrypted_db # Successful decryption - print("Common offsets failed. Initiating brute-force with multithreading...") + def animate_message(stop_event): + base_msg = "Common offsets failed. 
Initiating brute-force with multithreading" + dots = ["", ".", "..", "..."] + i = 0 + while not stop_event.is_set(): + logger.info(f"{base_msg}{dots[i % len(dots)]}\x1b[K\r") + time.sleep(0.3) + i += 1 + logger.info(f"Common offsets failed but brute-forcing the offset works!{CLEAR_LINE}") + + stop_event = threading.Event() + anim_thread = threading.Thread(target=animate_message, args=(stop_event,)) + anim_thread.start() # Convert brute force generator into a list for parallel processing offset_combinations = list(brute_force_offset()) @@ -152,19 +175,23 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> start_iv, end_iv, start_db = offset_tuple iv = database[start_iv:end_iv] db_ciphertext = database[start_db:] + logger.debug(""f"Trying offsets: IV {start_iv}-{end_iv}, DB {start_db}{CLEAR_LINE}") try: db = _decrypt_database(db_ciphertext, main_key, iv) - print( + except (zlib.error, ValueError): + return None # Decryption failed, move to next + else: + stop_event.set() + anim_thread.join() + logger.info( f"The offsets of your IV and database are {start_iv} and " f"{start_db}, respectively. To include your offsets in the " "program, please report it by creating an issue on GitHub: " "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47" - "\nShutting down other threads..." + f"\nShutting down other threads...{CLEAR_LINE}" ) return db - except (zlib.error, ValueError): - return None # Decryption failed, move to next with concurrent.futures.ThreadPoolExecutor(max_worker) as executor: future_to_offset = {executor.submit(attempt_decrypt, offset): offset for offset in offset_combinations} @@ -178,9 +205,14 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> return result except KeyboardInterrupt: - print("\nBrute force interrupted by user (Ctrl+C). Exiting gracefully...") + stop_event.set() + anim_thread.join() + logger.info(f"Brute force interrupted by user (Ctrl+C). 
Shutting down gracefully...{CLEAR_LINE}") executor.shutdown(wait=False, cancel_futures=True) exit(1) + finally: + stop_event.set() + anim_thread.join() raise OffsetNotFoundError("Could not find the correct offsets for decryption.") @@ -305,7 +337,7 @@ def decrypt_backup( main_key, hex_key = _derive_main_enc_key(key) if show_crypt15: hex_key_str = ' '.join([hex_key.hex()[c:c+4] for c in range(0, len(hex_key.hex()), 4)]) - print(f"The HEX key of the crypt15 backup is: {hex_key_str}") + logger.info(f"The HEX key of the crypt15 backup is: {hex_key_str}{CLEAR_LINE}") else: main_key = key[126:] diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index 5133d6c..cd364e7 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -1,5 +1,6 @@ #!/usr/bin/python3 +import logging import sqlite3 import os import shutil @@ -9,12 +10,15 @@ from markupsafe import escape as htmle from base64 import b64decode, b64encode from datetime import datetime from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable +logger = logging.getLogger(__name__) + + def contacts(db, data, enrich_from_vcards): """ Process WhatsApp contacts from the database. 
@@ -33,12 +37,12 @@ def contacts(db, data, enrich_from_vcards): if total_row_number == 0: if enrich_from_vcards is not None: - print("No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") + logger.info("No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") else: - print("No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") + logger.warning("No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") return False else: - print(f"Processing contacts...({total_row_number})") + logger.info(f"Processed {total_row_number} contacts\n") c.execute("SELECT jid, COALESCE(display_name, wa_name) as display_name, status FROM wa_contacts;") row = c.fetchone() @@ -66,7 +70,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c = db.cursor() total_row_number = _get_message_count(c, filter_empty, filter_date, filter_chat) - print(f"Processing messages...(0/{total_row_number})", end="\r") + logger.info(f"Processing messages...(0/{total_row_number})\r") try: content_cursor = _get_messages_cursor_legacy(c, filter_empty, filter_date, filter_chat) @@ -87,12 +91,12 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, i += 1 if i % 1000 == 0: - print(f"Processing messages...({i}/{total_row_number})", end="\r") + logger.info(f"Processing messages...({i}/{total_row_number})\r") # Fetch the next row safely content = _fetch_row_safely(content_cursor) - print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") # Helper functions for message processing @@ -482,7 +486,7 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa """ c 
= db.cursor() total_row_number = _get_media_count(c, filter_empty, filter_date, filter_chat) - print(f"\nProcessing media...(0/{total_row_number})", end="\r") + logger.info(f"Processing media...(0/{total_row_number})\r") try: content_cursor = _get_media_cursor_legacy(c, filter_empty, filter_date, filter_chat) @@ -501,11 +505,11 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa i += 1 if i % 100 == 0: - print(f"Processing media...({i}/{total_row_number})", end="\r") + logger.info(f"Processing media...({i}/{total_row_number})\r") content = content_cursor.fetchone() - print(f"Processing media...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} media{CLEAR_LINE}") # Helper functions for media processing @@ -676,7 +680,7 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): rows = _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty) total_row_number = len(rows) - print(f"\nProcessing vCards...(0/{total_row_number})", end="\r") + logger.info(f"Processing vCards...(0/{total_row_number})\r") # Create vCards directory if it doesn't exist path = os.path.join(media_folder, "vCards") @@ -684,7 +688,8 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): for index, row in enumerate(rows): _process_vcard_row(row, path, data) - print(f"Processing vCards...({index + 1}/{total_row_number})", end="\r") + logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r") + logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}") def _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty): @@ -777,7 +782,7 @@ def calls(db, data, timezone_offset, filter_chat): if total_row_number == 0: return - print(f"\nProcessing calls...({total_row_number})", end="\r") + logger.info(f"Processing calls...({total_row_number})\r") # Fetch call data calls_data = _fetch_calls_data(c, filter_chat) @@ -793,6 +798,7 @@ def calls(db, 
data, timezone_offset, filter_chat): # Add the calls chat to the data data.add_chat("000000000000000", chat) + logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}") def _get_calls_count(c, filter_chat): @@ -917,7 +923,7 @@ def create_html( template = setup_template(template, no_avatar, experimental) total_row_number = len(data) - print(f"\nGenerating chats...(0/{total_row_number})", end="\r") + logger.info(f"Generating chats...(0/{total_row_number})\r") # Create output directory if it doesn't exist if not os.path.isdir(output_folder): @@ -958,9 +964,9 @@ def create_html( ) if current % 10 == 0: - print(f"Generating chats...({current}/{total_row_number})", end="\r") + logger.info(f"Generating chats...({current}/{total_row_number})\r") - print(f"Generating chats...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Generated {total_row_number} chats{CLEAR_LINE}") def _generate_single_chat(current_chat, safe_file_name, name, contact, output_folder, template, w3css, headline): diff --git a/Whatsapp_Chat_Exporter/exported_handler.py b/Whatsapp_Chat_Exporter/exported_handler.py index 7215f6f..4a81adb 100644 --- a/Whatsapp_Chat_Exporter/exported_handler.py +++ b/Whatsapp_Chat_Exporter/exported_handler.py @@ -1,10 +1,14 @@ #!/usr/bin/python3 import os +import logging from datetime import datetime from mimetypes import MimeTypes from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import Device +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device + + +logger = logging.getLogger(__name__) def messages(path, data, assume_first_as_me=False): @@ -38,9 +42,9 @@ def messages(path, data, assume_first_as_me=False): # Show progress if index % 1000 == 0: - print(f"Processing messages & media...({index}/{total_row_number})", end="\r") + logger.info(f"Processing messages & media...({index}/{total_row_number})\r") - print(f"Processing messages & media...({total_row_number}/{total_row_number})") + 
logger.info(f"Processed {total_row_number} messages & media{CLEAR_LINE}") return data diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 7a15835..a833c7f 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -1,22 +1,26 @@ #!/usr/bin/python3 import os +import logging import shutil from glob import glob from pathlib import Path from mimetypes import MimeTypes from markupsafe import escape as htmle from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CURRENT_TZ_OFFSET, get_chat_condition +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, slugify, Device +logger = logging.getLogger(__name__) + + def contacts(db, data): """Process WhatsApp contacts with status information.""" c = db.cursor() c.execute("""SELECT count() FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") total_row_number = c.fetchone()[0] - print(f"Pre-processing contacts...({total_row_number})") + logger.info(f"Pre-processing contacts...({total_row_number})\r") c.execute("""SELECT ZWHATSAPPID, ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") content = c.fetchone() @@ -29,6 +33,7 @@ def contacts(db, data): current_chat.status = content["ZABOUTTEXT"] data.add_chat(zwhatsapp_id, current_chat) content = c.fetchone() + logger.info(f"Pre-processed {total_row_number} contacts{CLEAR_LINE}") def process_contact_avatars(current_chat, media_folder, contact_id): @@ -85,7 +90,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c.execute(contact_query) total_row_number = c.fetchone()[0] - print(f"Processing contacts...({total_row_number})") + logger.info(f"Processing contacts...({total_row_number})\r") # Get distinct contacts contacts_query = f""" @@ -123,6 
+128,8 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, # Process avatar images process_contact_avatars(current_chat, media_folder, contact_id) content = c.fetchone() + + logger.info(f"Processed {total_row_number} contacts{CLEAR_LINE}") # Get message count message_count_query = f""" @@ -139,7 +146,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c.execute(message_count_query) total_row_number = c.fetchone()[0] - print(f"Processing messages...(0/{total_row_number})", end="\r") + logger.info(f"Processing messages...(0/{total_row_number})\r") # Fetch messages messages_query = f""" @@ -207,10 +214,9 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, # Update progress i += 1 if i % 1000 == 0: - print(f"Processing messages...({i}/{total_row_number})", end="\r") + logger.info(f"Processing messages...({i}/{total_row_number})\r") content = c.fetchone() - - print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") def process_message_data(message, content, is_group_message, data, cursor2): @@ -329,7 +335,7 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa """ c.execute(media_count_query) total_row_number = c.fetchone()[0] - print(f"\nProcessing media...(0/{total_row_number})", end="\r") + logger.info(f"Processing media...(0/{total_row_number})\r") # Fetch media items media_query = f""" @@ -365,10 +371,9 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa # Update progress i += 1 if i % 100 == 0: - print(f"Processing media...({i}/{total_row_number})", end="\r") + logger.info(f"Processing media...({i}/{total_row_number})\r") content = c.fetchone() - - print(f"Processing media...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} media{CLEAR_LINE}") def 
process_media_item(content, data, media_folder, mime, separate_media): @@ -444,7 +449,7 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): c.execute(vcard_query) contents = c.fetchall() total_row_number = len(contents) - print(f"\nProcessing vCards...(0/{total_row_number})", end="\r") + logger.info(f"Processing vCards...(0/{total_row_number})\r") # Create vCards directory path = f'{media_folder}/Message/vCards' @@ -453,7 +458,8 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): # Process each vCard for index, content in enumerate(contents): process_vcard_item(content, path, data) - print(f"Processing vCards...({index + 1}/{total_row_number})", end="\r") + logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r") + logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}") def process_vcard_item(content, path, data): @@ -510,7 +516,7 @@ def calls(db, data, timezone_offset, filter_chat): if total_row_number == 0: return - print(f"\nProcessing calls...({total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}\n") # Fetch call records calls_query = f""" diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index a1dcd30..a3c26e2 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 +import logging import shutil import sqlite3 import os import getpass from sys import exit -from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier from Whatsapp_Chat_Exporter.bplist import BPListReader try: from iphone_backup_decrypt import EncryptedBackup, RelativePath @@ -15,6 +16,8 @@ else: support_encrypted = True +logger = logging.getLogger(__name__) + class BackupExtractor: """ A class to handle the extraction of WhatsApp data from iOS backups, @@ -57,12 +60,13 
@@ class BackupExtractor: Handles the extraction of data from an encrypted iOS backup. """ if not support_encrypted: - print("You don't have the dependencies to handle encrypted backup.") - print("Read more on how to deal with encrypted backup:") - print("https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage") + logger.error("You don't have the dependencies to handle encrypted backup." + "Read more on how to deal with encrypted backup:" + "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" + ) return - print("Encryption detected on the backup!") + logger.info(f"Encryption detected on the backup!{CLEAR_LINE}") password = getpass.getpass("Enter the password for the backup:") self._decrypt_backup(password) self._extract_decrypted_files() @@ -74,7 +78,7 @@ class BackupExtractor: Args: password (str): The password for the encrypted backup. """ - print("Trying to decrypt the iOS backup...", end="") + logger.info(f"Trying to decrypt the iOS backup...{CLEAR_LINE}") self.backup = EncryptedBackup( backup_directory=self.base_dir, passphrase=password, @@ -82,7 +86,8 @@ class BackupExtractor: check_same_thread=False, decrypt_chunk_size=self.decrypt_chunk_size, ) - print("Done\nDecrypting WhatsApp database...", end="") + logger.info(f"iOS backup decrypted successfully!{CLEAR_LINE}") + logger.info("Decrypting WhatsApp database...\n") try: self.backup.extract_file( relative_path=RelativePath.WHATSAPP_MESSAGES, @@ -100,23 +105,23 @@ class BackupExtractor: output_filename=self.identifiers.CALL, ) except ValueError: - print("Failed to decrypt backup: incorrect password?") + logger.error("Failed to decrypt backup: incorrect password?") exit(7) except FileNotFoundError: - print( + logger.error( "Essential WhatsApp files are missing from the iOS backup. " "Perhapse you enabled end-to-end encryption for the backup? 
" "See https://wts.knugi.dev/docs.html?dest=iose2e" ) exit(6) else: - print("Done") + logger.info(f"Done{CLEAR_LINE}") def _extract_decrypted_files(self): """Extract all WhatsApp files after decryption""" def extract_progress_handler(file_id, domain, relative_path, n, total_files): if n % 100 == 0: - print(f"Decrypting and extracting files...({n}/{total_files})", end="\r") + logger.info(f"Decrypting and extracting files...({n}/{total_files})\r") return True self.backup.extract_files( @@ -125,7 +130,7 @@ class BackupExtractor: preserve_folders=True, filter_callback=extract_progress_handler ) - print(f"All required files are decrypted and extracted. ", end="\n") + logger.info(f"All required files are decrypted and extracted.{CLEAR_LINE}") def _extract_unencrypted_backup(self): """ @@ -144,10 +149,10 @@ class BackupExtractor: if not os.path.isfile(wts_db_path): if self.identifiers is WhatsAppIdentifier: - print("WhatsApp database not found.") + logger.error("WhatsApp database not found.") else: - print("WhatsApp Business database not found.") - print( + logger.error("WhatsApp Business database not found.") + logger.error( "Essential WhatsApp files are missing from the iOS backup. " "Perhapse you enabled end-to-end encryption for the backup? " "See https://wts.knugi.dev/docs.html?dest=iose2e" @@ -157,12 +162,12 @@ class BackupExtractor: shutil.copyfile(wts_db_path, self.identifiers.MESSAGE) if not os.path.isfile(contact_db_path): - print("Contact database not found. Skipping...") + logger.warning(f"Contact database not found. Skipping...{CLEAR_LINE}") else: shutil.copyfile(contact_db_path, self.identifiers.CONTACT) if not os.path.isfile(call_db_path): - print("Call database not found. Skipping...") + logger.warning(f"Call database not found. 
Skipping...{CLEAR_LINE}") else: shutil.copyfile(call_db_path, self.identifiers.CALL) @@ -176,7 +181,7 @@ class BackupExtractor: c = manifest.cursor() c.execute(f"SELECT count() FROM Files WHERE domain = '{_wts_id}'") total_row_number = c.fetchone()[0] - print(f"Extracting WhatsApp files...(0/{total_row_number})", end="\r") + logger.info(f"Extracting WhatsApp files...(0/{total_row_number})\r") c.execute( f""" SELECT fileID, relativePath, flags, file AS metadata, @@ -213,9 +218,9 @@ class BackupExtractor: os.utime(destination, (modification, modification)) if row["_index"] % 100 == 0: - print(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})", end="\r") + logger.info(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})\r") row = c.fetchone() - print(f"Extracting WhatsApp files...({total_row_number}/{total_row_number})", end="\n") + logger.info(f"Extracted WhatsApp files...({total_row_number}){CLEAR_LINE}") def extract_media(base_dir, identifiers, decrypt_chunk_size): diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index b6f7bc2..2a1e22c 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -1,3 +1,4 @@ +import logging import sqlite3 import jinja2 import json @@ -28,7 +29,9 @@ except ImportError: MAX_SIZE = 4 * 1024 * 1024 # Default 4MB ROW_SIZE = 0x3D0 CURRENT_TZ_OFFSET = datetime.now().astimezone().utcoffset().seconds / 3600 +CLEAR_LINE = "\x1b[K\n" +logger = logging.getLogger(__name__) def convert_time_unit(time_second: int) -> str: """Converts a time duration in seconds to a human-readable string. 
@@ -151,7 +154,7 @@ def check_update(): try: raw = urllib.request.urlopen(PACKAGE_JSON) except Exception: - print("Failed to check for updates.") + logger.error("Failed to check for updates.") return 1 else: with raw: @@ -161,17 +164,19 @@ def check_update(): __version__ = importlib.metadata.version("whatsapp_chat_exporter") current_version = tuple(map(int, __version__.split("."))) if current_version < latest_version: - print("===============Update===============") - print("A newer version of WhatsApp Chat Exporter is available.") - print("Current version: " + __version__) - print("Latest version: " + package_info["info"]["version"]) + logger.info( + "===============Update===============\n" + "A newer version of WhatsApp Chat Exporter is available.\n" + f"Current version: {__version__}\n" + f"Latest version: {package_info['info']['version']}\n" + ) if platform == "win32": - print("Update with: pip install --upgrade whatsapp-chat-exporter") + logger.info("Update with: pip install --upgrade whatsapp-chat-exporter\n") else: - print("Update with: pip3 install --upgrade whatsapp-chat-exporter") - print("====================================") + logger.info("Update with: pip3 install --upgrade whatsapp-chat-exporter\n") + logger.info("====================================\n") else: - print("You are using the latest version of WhatsApp Chat Exporter.") + logger.info("You are using the latest version of WhatsApp Chat Exporter.\n") return 0 @@ -229,7 +234,7 @@ def import_from_json(json_file: str, data: Dict[str, ChatStore]): with open(json_file, "r") as f: temp_data = json.loads(f.read()) total_row_number = len(tuple(temp_data.keys())) - print(f"Importing chats from JSON...(0/{total_row_number})", end="\r") + logger.info(f"Importing chats from JSON...(0/{total_row_number})\r") for index, (jid, chat_data) in enumerate(temp_data.items()): chat = ChatStore(chat_data.get("type"), chat_data.get("name")) chat.my_avatar = chat_data.get("my_avatar") @@ -258,8 +263,9 @@ def 
import_from_json(json_file: str, data: Dict[str, ChatStore]): message.sticker = msg.get("sticker") chat.add_message(id, message) data[jid] = chat - print( - f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") + logger.info( + f"Importing chats from JSON...({index + 1}/{total_row_number})\r") + logger.info(f"Imported chats from JSON...({total_row_number}){CLEAR_LINE}") def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_print_json: int, avoid_encoding_json: bool): @@ -272,21 +278,21 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p """ json_files = [f for f in os.listdir(source_dir) if f.endswith('.json')] if not json_files: - print("No JSON files found in the source directory.") + logger.error("No JSON files found in the source directory.") return - print("JSON files found:", json_files) + logger.info("JSON files found:", json_files) for json_file in json_files: source_path = os.path.join(source_dir, json_file) target_path = os.path.join(target_dir, json_file) if not os.path.exists(target_path): - print(f"Copying '{json_file}' to target directory...") + logger.info(f"Copying '{json_file}' to target directory...") os.makedirs(target_dir, exist_ok=True) shutil.copy2(source_path, target_path) else: - print( + logger.info( f"Merging '{json_file}' with existing file in target directory...") with open(source_path, 'r') as src_file, open(target_path, 'r') as tgt_file: source_data = json.load(src_file) @@ -311,7 +317,7 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p # Check if the merged data differs from the original target data if json.dumps(merged_data, sort_keys=True) != json.dumps(target_data, sort_keys=True): - print( + logger.info( f"Changes detected in '{json_file}', updating target file...") with open(target_path, 'w') as merged_file: json.dump( @@ -321,13 +327,13 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p 
ensure_ascii=not avoid_encoding_json, ) else: - print( + logger.info( f"No changes detected in '{json_file}', skipping update.") # Merge media directories source_media_path = os.path.join(source_dir, media_dir) target_media_path = os.path.join(target_dir, media_dir) - print( + logger.info( f"Merging media directories. Source: {source_media_path}, target: {target_media_path}") if os.path.exists(source_media_path): for root, _, files in os.walk(source_media_path): @@ -339,7 +345,7 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p target_file = os.path.join(target_root, file) # we only copy if the file doesn't exist in the target or if the source is newer if not os.path.exists(target_file) or os.path.getmtime(source_file) > os.path.getmtime(target_file): - print(f"Copying '{source_file}' to '{target_file}'...") + logger.info(f"Copying '{source_file}' to '{target_file}'...") shutil.copy2(source_file, target_file) From 75c3999567d2e67e2413782cfb67495e85659469 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 16:56:19 +0800 Subject: [PATCH 041/108] Update debug log name --- Whatsapp_Chat_Exporter/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 9279812..e883637 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -719,7 +719,8 @@ def setup_logging(level): log_handler_stdout.terminator = "" handlers = [log_handler_stdout] if level == logging.DEBUG: - handlers.append(logging.FileHandler("debug.log", mode="w")) + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + handlers.append(logging.FileHandler(f"wtsexpoter-debug-{timestamp}.log", mode="w")) logging.basicConfig( level=level, format="[%(levelname)s] %(message)s", From dd795f3282e3049b65ce6a14bf2d58b93f70fcab Mon Sep 17 00:00:00 2001 From: KnugiHK 
<24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 17:27:23 +0800 Subject: [PATCH 042/108] Adjust banner position --- Whatsapp_Chat_Exporter/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index e883637..807279a 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -48,8 +48,8 @@ WTSEXPORTER_BANNER = f"""======================================================= ╚██████╗██║ ██║██║ ██║ ██║ ███████╗██╔╝ ██╗██║ ╚██████╔╝██║ ██║ ██║ ███████╗██║ ██║ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ - WhatsApp Chat Exporter: A customizable Android and iOS/iPadOS WhatsApp database parser - Version: {__version__} + WhatsApp Chat Exporter: A customizable Android and iOS/iPadOS WhatsApp database parser + Version: {__version__} ========================================================================================================""" From 1eea5fc5c1ad86f00a68129351e54841a2f321b0 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 17:29:24 +0800 Subject: [PATCH 043/108] Use the new chat importing method from data_model This commit also fixes #150 --- Whatsapp_Chat_Exporter/utility.py | 36 +++++-------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 2a1e22c..08ea6b0 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -11,7 +11,7 @@ from bleach import clean as sanitize from markupsafe import Markup from datetime import datetime, timedelta from enum import IntEnum -from Whatsapp_Chat_Exporter.data_model import ChatStore +from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore from typing import Dict, List, Optional, Tuple try: from enum import StrEnum, IntEnum @@ -223,49 +223,23 @@ 
class Device(StrEnum): EXPORTED = "exported" -def import_from_json(json_file: str, data: Dict[str, ChatStore]): +def import_from_json(json_file: str, data: ChatCollection): """Imports chat data from a JSON file into the data dictionary. Args: json_file: The path to the JSON file. data: The dictionary to store the imported chat data. """ - from Whatsapp_Chat_Exporter.data_model import ChatStore, Message with open(json_file, "r") as f: temp_data = json.loads(f.read()) total_row_number = len(tuple(temp_data.keys())) logger.info(f"Importing chats from JSON...(0/{total_row_number})\r") for index, (jid, chat_data) in enumerate(temp_data.items()): - chat = ChatStore(chat_data.get("type"), chat_data.get("name")) - chat.my_avatar = chat_data.get("my_avatar") - chat.their_avatar = chat_data.get("their_avatar") - chat.their_avatar_thumb = chat_data.get("their_avatar_thumb") - chat.status = chat_data.get("status") - for id, msg in chat_data.get("messages").items(): - message = Message( - from_me=msg["from_me"], - timestamp=msg["timestamp"], - time=msg["time"], - key_id=msg["key_id"], - received_timestamp=msg.get("received_timestamp"), - read_timestamp=msg.get("read_timestamp") - ) - message.media = msg.get("media") - message.meta = msg.get("meta") - message.data = msg.get("data") - message.sender = msg.get("sender") - message.safe = msg.get("safe") - message.mime = msg.get("mime") - message.reply = msg.get("reply") - message.quoted_data = msg.get("quoted_data") - message.caption = msg.get("caption") - message.thumb = msg.get("thumb") - message.sticker = msg.get("sticker") - chat.add_message(id, message) - data[jid] = chat + chat = ChatStore.from_json(chat_data) + data.add_chat(jid, chat) logger.info( f"Importing chats from JSON...({index + 1}/{total_row_number})\r") - logger.info(f"Imported chats from JSON...({total_row_number}){CLEAR_LINE}") + logger.info(f"Imported {total_row_number} chats from JSON{CLEAR_LINE}") def incremental_merge(source_dir: str, target_dir: str, 
media_dir: str, pretty_print_json: int, avoid_encoding_json: bool): From ae6e8ba7e2d67f2188a282c5daa68b06f5c8efdf Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 17:46:00 +0800 Subject: [PATCH 044/108] Make to_ & from_json functions dynamic This is to prevent error like #150 in the future --- Whatsapp_Chat_Exporter/data_model.py | 59 ++++++++-------------------- 1 file changed, 16 insertions(+), 43 deletions(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 1ebf75d..63a3252 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -207,26 +207,21 @@ class ChatStore: def to_json(self) -> Dict[str, Any]: """Convert chat store to JSON-serializable dict.""" - return { - 'name': self.name, - 'type': self.type, - 'my_avatar': self.my_avatar, - 'their_avatar': self.their_avatar, - 'their_avatar_thumb': self.their_avatar_thumb, - 'status': self.status, - 'media_base': self.media_base, - 'messages': {id: msg.to_json() for id, msg in self._messages.items()} + json_dict = { + key: value + for key, value in self.__dict__.items() + if key != '_messages' } + json_dict['messages'] = {id: msg.to_json() for id, msg in self._messages.items()} + return json_dict @classmethod def from_json(cls, data: Dict) -> 'ChatStore': """Create a chat store from JSON data.""" chat = cls(data.get("type"), data.get("name")) - chat.my_avatar = data.get("my_avatar") - chat.their_avatar = data.get("their_avatar") - chat.their_avatar_thumb = data.get("their_avatar_thumb") - chat.status = data.get("status") - chat.media_base = data.get("media_base") + for key, value in data.items(): + if hasattr(chat, key) and key not in ("messages", "type", "name"): + setattr(chat, key, value) for id, msg_data in data.get("messages", {}).items(): message = Message.from_json(msg_data) chat.add_message(id, message) @@ -347,24 +342,8 @@ class Message: def to_json(self) -> Dict[str, 
Any]: """Convert message to JSON-serializable dict.""" return { - 'from_me': self.from_me, - 'timestamp': self.timestamp, - 'time': self.time, - 'media': self.media, - 'key_id': self.key_id, - 'meta': self.meta, - 'data': self.data, - 'sender': self.sender, - 'safe': self.safe, - 'mime': self.mime, - 'reply': self.reply, - 'quoted_data': self.quoted_data, - 'caption': self.caption, - 'thumb': self.thumb, - 'sticker': self.sticker, - 'message_type': self.message_type, - 'received_timestamp': self.received_timestamp, - 'read_timestamp': self.read_timestamp + key: value + for key, value in self.__dict__.items() } @classmethod @@ -378,15 +357,9 @@ class Message: received_timestamp=data.get("received_timestamp"), read_timestamp=data.get("read_timestamp") ) - message.media = data.get("media") - message.meta = data.get("meta") - message.data = data.get("data") - message.sender = data.get("sender") - message.safe = data.get("safe") - message.mime = data.get("mime") - message.reply = data.get("reply") - message.quoted_data = data.get("quoted_data") - message.caption = data.get("caption") - message.thumb = data.get("thumb") - message.sticker = data.get("sticker") + added = ("from_me", "timestamp", "time", "key_id", "message_type", + "received_timestamp", "read_timestamp") + for key, value in data.items(): + if hasattr(message, key) and key not in added: + setattr(message, key, value) return message From bb48cd381b27579b5caf79931becefab19c4aa31 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 17:49:33 +0800 Subject: [PATCH 045/108] Fix test case where media_base should never be None --- tests/test_incremental_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py index 39f6c19..5d6ef58 100644 --- a/tests/test_incremental_merge.py +++ b/tests/test_incremental_merge.py @@ -86,7 +86,7 @@ chat_data_merged = { "their_avatar": 
os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), "their_avatar_thumb": None, "status": None, - "media_base": None, + "media_base": "", "messages": { "24690": { "from_me": True, From 667c005a6706d11e1ce5bf5094324f3d5a6cab10 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 17:49:51 +0800 Subject: [PATCH 046/108] Make received_ & read_timestamp optional --- Whatsapp_Chat_Exporter/data_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 63a3252..8747419 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -277,8 +277,8 @@ class Message: timestamp: int, time: Union[int, float, str], key_id: Union[int, str], - received_timestamp: int, - read_timestamp: int, + received_timestamp: int = None, + read_timestamp: int = None, timezone_offset: int = 0, message_type: Optional[int] = None ) -> None: @@ -290,8 +290,8 @@ class Message: timestamp (int): Message timestamp time (Union[int, float, str]): Message time key_id (int): Message unique identifier - received_timestamp (int): When message was received - read_timestamp (int): When message was read + received_timestamp (int, optional): When message was received. Defaults to None + read_timestamp (int, optional): When message was read. Defaults to None timezone_offset (int, optional): Hours offset from UTC. Defaults to 0 message_type (Optional[int], optional): Type of message. 
Defaults to None From e8acf6da32bef5f7ae999adeaef4cff60e9ccdb2 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 17:59:20 +0800 Subject: [PATCH 047/108] Fix key access in f-string for older Python --- Whatsapp_Chat_Exporter/android_crypt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/android_crypt.py b/Whatsapp_Chat_Exporter/android_crypt.py index 68f0120..cf7148b 100644 --- a/Whatsapp_Chat_Exporter/android_crypt.py +++ b/Whatsapp_Chat_Exporter/android_crypt.py @@ -149,7 +149,7 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> pass # Try next offset else: logger.debug( - f"Decryption successful with known offsets: IV {offsets["iv"]}, DB {offsets["db"]}{CLEAR_LINE}" + f"Decryption successful with known offsets: IV {offsets['iv']}, DB {offsets['db']}{CLEAR_LINE}" ) return decrypted_db # Successful decryption From cc410b850329d84d9930056b7e8245db044fab97 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 18:01:25 +0800 Subject: [PATCH 048/108] Save the environment by reducing CI targets --- .github/workflows/ci.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9b9d528..7bb5d76 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,28 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macos-latest] python-version: [3.9, "3.10", "3.11", "3.12", "3.13"] + include: + - os: windows-latest + python-version: "3.13" + - os: macos-latest + python-version: "3.13" + exclude: + - os: windows-latest + python-version: 3.9 + - os: windows-latest + python-version: "3.10" + - os: windows-latest + python-version: "3.11" + - os: windows-latest + python-version: "3.12" + - os: macos-latest + python-version: 3.9 + - os: macos-latest + python-version: "3.10" + - os: macos-latest + 
python-version: "3.11" + - os: macos-latest + python-version: "3.12" steps: - name: Checkout code From 33149075d3314fb35db3a4f9f43fb14df2e542e5 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 18:07:51 +0800 Subject: [PATCH 049/108] autopep8 --- Whatsapp_Chat_Exporter/__main__.py | 9 +- Whatsapp_Chat_Exporter/android_crypt.py | 9 +- Whatsapp_Chat_Exporter/android_handler.py | 261 +++++++++++--------- Whatsapp_Chat_Exporter/bplist.py | 131 +++++----- Whatsapp_Chat_Exporter/exported_handler.py | 44 ++-- Whatsapp_Chat_Exporter/ios_handler.py | 130 +++++----- Whatsapp_Chat_Exporter/ios_media_handler.py | 12 +- Whatsapp_Chat_Exporter/utility.py | 3 +- scripts/brazilian_number_processing.py | 29 ++- scripts/bruteforce_crypt15.py | 1 + tests/test_brazilian_number_processing.py | 41 +-- tests/test_incremental_merge.py | 6 +- tests/test_nuitka_binary.py | 4 +- tests/test_vcards_contacts.py | 3 + 14 files changed, 372 insertions(+), 311 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 807279a..0a7aae3 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -452,7 +452,8 @@ def decrypt_android_backup(args) -> int: elif "crypt15" in args.backup: crypt = Crypt.CRYPT15 else: - logger.error(f"Unknown backup format. The backup file must be crypt12, crypt14 or crypt15.{CLEAR_LINE}") + logger.error( + f"Unknown backup format. The backup file must be crypt12, crypt14 or crypt15.{CLEAR_LINE}") return 1 # Get key @@ -505,11 +506,11 @@ def handle_decrypt_error(error: int) -> None: """Handle decryption errors with appropriate messages.""" if error == 1: logger.error("Dependencies of decrypt_backup and/or extract_encrypted_key" - " are not present. For details, see README.md.\n") + " are not present. For details, see README.md.\n") exit(3) elif error == 2: logger.error("Failed when decompressing the decrypted backup. 
" - "Possibly incorrect offsets used in decryption.\n") + "Possibly incorrect offsets used in decryption.\n") exit(4) else: logger.error("Unknown error occurred.\n") @@ -598,7 +599,7 @@ def handle_media_directory(args) -> None: logger.info(f"Media directory has been moved to the output directory{CLEAR_LINE}") except PermissionError: logger.warning("Cannot remove original WhatsApp directory. " - "Perhaps the directory is opened?\n") + "Perhaps the directory is opened?\n") else: logger.info(f"Copying media directory...\r") shutil.copytree(args.media, media_path) diff --git a/Whatsapp_Chat_Exporter/android_crypt.py b/Whatsapp_Chat_Exporter/android_crypt.py index cf7148b..3e921d1 100644 --- a/Whatsapp_Chat_Exporter/android_crypt.py +++ b/Whatsapp_Chat_Exporter/android_crypt.py @@ -121,6 +121,7 @@ def _decrypt_database(db_ciphertext: bytes, main_key: bytes, iv: bytes) -> bytes ) return db + def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> bytes: """Decrypt a crypt14 database using multithreading for brute-force offset detection. @@ -194,7 +195,8 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> return db with concurrent.futures.ThreadPoolExecutor(max_worker) as executor: - future_to_offset = {executor.submit(attempt_decrypt, offset): offset for offset in offset_combinations} + future_to_offset = {executor.submit(attempt_decrypt, offset) + : offset for offset in offset_combinations} try: for future in concurrent.futures.as_completed(future_to_offset): @@ -217,7 +219,6 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> raise OffsetNotFoundError("Could not find the correct offsets for decryption.") - def _decrypt_crypt12(database: bytes, main_key: bytes) -> bytes: """Decrypt a crypt12 database. 
@@ -319,7 +320,7 @@ def decrypt_backup( if crypt is not Crypt.CRYPT15 and len(key) != 158: raise InvalidKeyError("The key file must be 158 bytes") - #signature check, this is check is used in crypt 12 and 14 + # signature check, this is check is used in crypt 12 and 14 if crypt != Crypt.CRYPT15: t1 = key[30:62] @@ -329,7 +330,6 @@ def decrypt_backup( if t1 != database[3:35] and crypt == Crypt.CRYPT12: raise ValueError("The signature of key file and backup file mismatch") - if crypt == Crypt.CRYPT15: if keyfile_stream: main_key, hex_key = _extract_enc_key(key) @@ -353,7 +353,6 @@ def decrypt_backup( except (InvalidFileFormatError, OffsetNotFoundError, ValueError) as e: raise DecryptionError(f"Decryption failed: {e}") from e - if not dry_run: with open(output, "wb") as f: f.write(db) diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index cd364e7..d371b4f 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -22,24 +22,26 @@ logger = logging.getLogger(__name__) def contacts(db, data, enrich_from_vcards): """ Process WhatsApp contacts from the database. 
- + Args: db: Database connection data: Data store object enrich_from_vcards: Path to vCard file for contact enrichment - + Returns: bool: False if no contacts found, True otherwise """ c = db.cursor() c.execute("SELECT count() FROM wa_contacts") total_row_number = c.fetchone()[0] - + if total_row_number == 0: if enrich_from_vcards is not None: - logger.info("No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") + logger.info( + "No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") else: - logger.warning("No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") + logger.warning( + "No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") return False else: logger.info(f"Processed {total_row_number} contacts\n") @@ -51,14 +53,14 @@ def contacts(db, data, enrich_from_vcards): if row["status"] is not None: current_chat.status = row["status"] row = c.fetchone() - + return True def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty): """ Process WhatsApp messages from the database. 
- + Args: db: Database connection data: Data store object @@ -85,17 +87,17 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, i = 0 # Fetch the first row safely content = _fetch_row_safely(content_cursor) - + while content is not None: _process_single_message(data, content, table_message, timezone_offset) - + i += 1 if i % 1000 == 0: logger.info(f"Processing messages...({i}/{total_row_number})\r") - + # Fetch the next row safely content = _fetch_row_safely(content_cursor) - + logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") @@ -106,8 +108,10 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat): try: empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") cursor.execute(f"""SELECT count() FROM messages @@ -123,8 +127,10 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat): except sqlite3.OperationalError: empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + include_filter = 
get_chat_condition( + filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") cursor.execute(f"""SELECT count() FROM message @@ -146,8 +152,10 @@ def _get_messages_cursor_legacy(cursor, filter_empty, filter_date, filter_chat): """Get cursor for legacy database schema.""" empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") cursor.execute(f"""SELECT messages.key_remote_jid, messages._id, @@ -209,8 +217,10 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): """Get cursor for new database schema.""" empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + exclude_filter = get_chat_condition( + 
filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") cursor.execute(f"""SELECT jid_global.raw_string as key_remote_jid, message._id, @@ -292,19 +302,20 @@ def _process_single_message(data, content, table_message, timezone_offset): """Process a single message row.""" if content["key_remote_jid"] is None: return - + # Get or create the chat if not data.get_chat(content["key_remote_jid"]): - current_chat = data.add_chat(content["key_remote_jid"], ChatStore(Device.ANDROID, content["chat_subject"])) + current_chat = data.add_chat(content["key_remote_jid"], ChatStore( + Device.ANDROID, content["chat_subject"])) else: current_chat = data.get_chat(content["key_remote_jid"]) - + # Determine sender_jid_row_id if "sender_jid_row_id" in content: sender_jid_row_id = content["sender_jid_row_id"] else: sender_jid_row_id = None - + # Create message object message = Message( from_me=not sender_jid_row_id and content["key_from_me"], @@ -316,19 +327,19 @@ def _process_single_message(data, content, table_message, timezone_offset): received_timestamp=content["received_timestamp"], read_timestamp=content["read_timestamp"] ) - + # Handle binary data if isinstance(content["data"], bytes): _process_binary_message(message, content) current_chat.add_message(content["_id"], message) return - + # Set sender for group chats if content["jid_type"] == JidType.GROUP and content["key_from_me"] == 0: _set_group_sender(message, content, data, table_message) else: message.sender = None - + # Handle quoted messages if content["quoted"] is not None: message.reply = content["quoted"] @@ -338,7 +349,7 @@ def _process_single_message(data, content, table_message, timezone_offset): message.quoted_data = content["quoted_data"] else: message.reply = None - + # Handle message caption if not table_message and content["media_caption"] is not None: # Old schema @@ -348,14 +359,14 @@ def _process_single_message(data, content, table_message, timezone_offset): message.caption = 
content["data"] else: message.caption = None - + # Handle message content based on status if content["status"] == 6: # 6 = Metadata _process_metadata_message(message, content, data, table_message) else: # Real message _process_regular_message(message, content, table_message) - + current_chat.add_message(content["_id"], message) @@ -385,7 +396,7 @@ def _set_group_sender(message, content, data, table_message): name = data.get_chat(content["remote_resource"]).name if "@" in content["remote_resource"]: fallback = content["remote_resource"].split('@')[0] - + message.sender = name or fallback @@ -393,7 +404,7 @@ def _process_metadata_message(message, content, data, table_message): """Process metadata message.""" message.meta = True name = fallback = None - + if table_message: if content["sender_jid_row_id"] > 0: _jid = content["group_sender_jid"] @@ -412,12 +423,12 @@ def _process_metadata_message(message, content, data, table_message): fallback = _jid.split('@')[0] else: name = "You" - + message.data = determine_metadata(content, name or fallback) - + if isinstance(message.data, str) and "
" in message.data: message.safe = True - + if message.data is None: if content["video_call"] is not None: # Missed call message.meta = True @@ -433,7 +444,7 @@ def _process_metadata_message(message, content, data, table_message): def _process_regular_message(message, content, table_message): """Process regular (non-metadata) message.""" message.sticker = content["media_wa_type"] == 20 # Sticker is a message - + if content["key_from_me"] == 1: if content["status"] == 5 and content["edit_version"] == 7 or table_message and content["media_wa_type"] == 15: msg = "Message deleted" @@ -458,7 +469,7 @@ def _process_regular_message(message, content, table_message): msg = content["data"] if msg is not None: msg = _format_message_text(msg) - + message.data = msg @@ -474,7 +485,7 @@ def _format_message_text(text): def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True): """ Process WhatsApp media files from the database. - + Args: db: Database connection data: Data store object @@ -487,28 +498,28 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa c = db.cursor() total_row_number = _get_media_count(c, filter_empty, filter_date, filter_chat) logger.info(f"Processing media...(0/{total_row_number})\r") - + try: content_cursor = _get_media_cursor_legacy(c, filter_empty, filter_date, filter_chat) except sqlite3.OperationalError: content_cursor = _get_media_cursor_new(c, filter_empty, filter_date, filter_chat) - + content = content_cursor.fetchone() mime = MimeTypes() - + # Ensure thumbnails directory exists Path(f"{media_folder}/thumbnails").mkdir(parents=True, exist_ok=True) - + i = 0 while content is not None: _process_single_media(data, content, media_folder, mime, separate_media) - + i += 1 if i % 100 == 0: logger.info(f"Processing media...({i}/{total_row_number})\r") - + content = content_cursor.fetchone() - + logger.info(f"Processed {total_row_number} media{CLEAR_LINE}") @@ -519,8 +530,10 @@ def 
_get_media_count(cursor, filter_empty, filter_date, filter_chat): try: empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") cursor.execute(f"""SELECT count() FROM message_media @@ -538,8 +551,10 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat): except sqlite3.OperationalError: empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") cursor.execute(f"""SELECT count() FROM message_media @@ -563,8 +578,10 @@ def _get_media_cursor_legacy(cursor, filter_empty, filter_date, filter_chat): """Get cursor for legacy media database schema.""" empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' - include_filter = 
get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") cursor.execute(f"""SELECT messages.key_remote_jid, message_row_id, @@ -596,8 +613,10 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat): """Get cursor for new media database schema.""" empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") cursor.execute(f"""SELECT jid.raw_string as key_remote_jid, message_row_id, @@ -633,10 +652,10 @@ def _process_single_media(data, content, media_folder, mime, separate_media): current_chat = data.get_chat(content["key_remote_jid"]) message = current_chat.get_message(content["message_row_id"]) message.media = True - + if os.path.isfile(file_path): message.data = file_path - + # Set mime type if content["mime_type"] is None: guess = mime.guess_type(file_path)[0] @@ -646,11 +665,11 @@ def _process_single_media(data, content, media_folder, mime, separate_media): message.mime = "application/octet-stream" else: message.mime = 
content["mime_type"] - + # Copy media to separate folder if needed if separate_media: - chat_display_name = slugify(current_chat.name or message.sender - or content["key_remote_jid"].split('@')[0], True) + chat_display_name = slugify(current_chat.name or message.sender + or content["key_remote_jid"].split('@')[0], True) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) @@ -661,7 +680,7 @@ def _process_single_media(data, content, media_folder, mime, separate_media): message.data = "The media is missing" message.mime = "media" message.meta = True - + # Handle thumbnail if content["thumbnail"] is not None: thumb_path = f"{media_folder}/thumbnails/{b64decode(content['file_hash']).hex()}.png" @@ -681,11 +700,11 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): total_row_number = len(rows) logger.info(f"Processing vCards...(0/{total_row_number})\r") - + # Create vCards directory if it doesn't exist path = os.path.join(media_folder, "vCards") Path(path).mkdir(parents=True, exist_ok=True) - + for index, row in enumerate(rows): _process_vcard_row(row, path, data) logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r") @@ -696,8 +715,10 @@ def _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty): """Execute vCard query for modern WhatsApp database schema.""" # Build the filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", 
"android") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") @@ -726,8 +747,10 @@ def _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty): """Execute vCard query for legacy WhatsApp database schema.""" # Build the filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") @@ -760,11 +783,11 @@ def _process_vcard_row(row, path, data): file_name = "".join(x for x in media_name if x.isalnum()) file_name = file_name.encode('utf-8')[:230].decode('utf-8', 'ignore') file_path = os.path.join(path, f"{file_name}.vcf") - + if not os.path.isfile(file_path): with open(file_path, "w", encoding="utf-8") as f: f.write(row["vcard"]) - + message = data.get_chat(row["key_remote_jid"]).get_message(row["message_row_id"]) message.data = "This media include the following vCard file(s):
" \ f'{htmle(media_name)}' @@ -776,26 +799,26 @@ def _process_vcard_row(row, path, data): def calls(db, data, timezone_offset, filter_chat): """Process call logs from WhatsApp database.""" c = db.cursor() - + # Check if there are any calls that match the filter total_row_number = _get_calls_count(c, filter_chat) if total_row_number == 0: return - + logger.info(f"Processing calls...({total_row_number})\r") - + # Fetch call data calls_data = _fetch_calls_data(c, filter_chat) - + # Create a chat store for all calls chat = ChatStore(Device.ANDROID, "WhatsApp Calls") - + # Process each call content = calls_data.fetchone() while content is not None: _process_call_record(content, chat, data, timezone_offset) content = calls_data.fetchone() - + # Add the calls chat to the data data.add_chat("000000000000000", chat) logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}") @@ -861,7 +884,7 @@ def _process_call_record(content, chat, data, timezone_offset): received_timestamp=None, # TODO: Add timestamp read_timestamp=None # TODO: Add timestamp ) - + # Get caller/callee name _jid = content["raw_string"] name = data.get_chat(_jid).name if _jid in data else content["chat_subject"] or None @@ -870,13 +893,13 @@ def _process_call_record(content, chat, data, timezone_offset): else: fallback = None call.sender = name or fallback - + # Set metadata call.meta = True - + # Construct call description based on call type and result call.data = _construct_call_description(content, call) - + # Add call to chat chat.add_message(content["_id"], call) @@ -888,7 +911,7 @@ def _construct_call_description(content, call): f"call {'to' if call.from_me else 'from'} " f"{call.sender} was " ) - + if content['call_result'] in (0, 4, 7): description += "cancelled." if call.from_me else "missed." elif content['call_result'] == 2: @@ -904,21 +927,21 @@ def _construct_call_description(content, call): ) else: description += "in an unknown state." 
- + return description def create_html( - data, - output_folder, - template=None, - embedded=False, - offline_static=False, - maximum_size=None, - no_avatar=False, - experimental=False, - headline=None - ): + data, + output_folder, + template=None, + embedded=False, + offline_static=False, + maximum_size=None, + no_avatar=False, + experimental=False, + headline=None +): """Generate HTML chat files from data.""" template = setup_template(template, no_avatar, experimental) @@ -936,33 +959,33 @@ def create_html( if len(current_chat) == 0: # Skip empty chats continue - + safe_file_name, name = get_file_name(contact, current_chat) if maximum_size is not None: _generate_paginated_chat( - current_chat, - safe_file_name, - name, - contact, - output_folder, - template, - w3css, - maximum_size, + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, + maximum_size, headline ) else: _generate_single_chat( - current_chat, - safe_file_name, - name, - contact, - output_folder, - template, - w3css, + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, headline ) - + if current % 10 == 0: logger.info(f"Generating chats...({current}/{total_row_number})\r") @@ -990,20 +1013,20 @@ def _generate_paginated_chat(current_chat, safe_file_name, name, contact, output current_size = 0 current_page = 1 render_box = [] - + # Use default maximum size if set to 0 if maximum_size == 0: maximum_size = MAX_SIZE - + last_msg = current_chat.get_last_message().key_id - + for message in current_chat.values(): # Calculate message size if message.data is not None and not message.meta and not message.media: current_size += len(message.data) + ROW_SIZE else: current_size += ROW_SIZE + 100 # Assume media and meta HTML are 100 bytes - + if current_size > maximum_size: # Create a new page output_file_name = f"{output_folder}/{safe_file_name}-{current_page}.html" @@ -1047,25 +1070,25 @@ def _generate_paginated_chat(current_chat, 
safe_file_name, name, contact, output def create_txt(data, output): """Generate text files from chat data.""" os.makedirs(output, exist_ok=True) - + for jik, chat in data.items(): if len(chat) == 0: continue - + # Determine file name if chat.name is not None: contact = chat.name.replace('/', '') else: contact = jik.replace('+', '') - + output_file = os.path.join(output, f"{contact}.txt") - + with open(output_file, "w", encoding="utf8") as f: for message in chat.values(): # Skip metadata in text format if message.meta and message.mime != "media": continue - + # Format the message formatted_message = _format_message_for_txt(message, contact) f.write(f"{formatted_message}\n") @@ -1074,16 +1097,16 @@ def create_txt(data, output): def _format_message_for_txt(message, contact): """Format a message for text output.""" date = datetime.fromtimestamp(message.timestamp).date() - + # Determine the sender name if message.from_me: name = "You" else: name = message.sender if message.sender else contact - + prefix = f"[{date} {message.time}] {name}: " prefix_length = len(prefix) - + # Handle different message types if message.media and ("/" in message.mime or message.mime == "media"): if message.data == "The media is missing": @@ -1095,9 +1118,9 @@ def _format_message_for_txt(message, contact): message_text = "" else: message_text = message.data.replace('
', f'\n{" " * prefix_length}') - + # Add caption if present if message.caption is not None: message_text += "\n" + ' ' * len(prefix) + message.caption.replace('
', f'\n{" " * prefix_length}') - + return f"{prefix}{message_text}" diff --git a/Whatsapp_Chat_Exporter/bplist.py b/Whatsapp_Chat_Exporter/bplist.py index 390fe6e..4ca572a 100644 --- a/Whatsapp_Chat_Exporter/bplist.py +++ b/Whatsapp_Chat_Exporter/bplist.py @@ -24,31 +24,32 @@ import struct import codecs from datetime import datetime, timedelta + class BPListWriter(object): def __init__(self, objects): self.bplist = "" self.objects = objects - + def binary(self): '''binary -> string - + Generates bplist ''' self.data = 'bplist00' - + # TODO: flatten objects and count max length size - + # TODO: write objects and save offsets - + # TODO: write offsets - + # TODO: write metadata - + return self.data - + def write(self, filename): ''' - + Writes bplist to file ''' if self.bplist != "": @@ -57,18 +58,19 @@ class BPListWriter(object): else: raise Exception('BPlist not yet generated') + class BPListReader(object): def __init__(self, s): self.data = s self.objects = [] self.resolved = {} - + def __unpackIntStruct(self, sz, s): '''__unpackIntStruct(size, string) -> int - + Unpacks the integer of given size (1, 2 or 4 bytes) from string ''' - if sz == 1: + if sz == 1: ot = '!B' elif sz == 2: ot = '!H' @@ -79,17 +81,17 @@ class BPListReader(object): else: raise Exception('int unpack size '+str(sz)+' unsupported') return struct.unpack(ot, s)[0] - + def __unpackInt(self, offset): '''__unpackInt(offset) -> int - + Unpacks int field from plist at given offset ''' return self.__unpackIntMeta(offset)[1] def __unpackIntMeta(self, offset): '''__unpackIntMeta(offset) -> (size, int) - + Unpacks int field from plist at given offset and returns its size and value ''' obj_header = self.data[offset] @@ -99,7 +101,7 @@ class BPListReader(object): def __resolveIntSize(self, obj_info, offset): '''__resolveIntSize(obj_info, offset) -> (count, offset) - + Calculates count of objref* array entries and returns count and offset to first element ''' if obj_info == 0x0F: @@ -112,10 +114,10 @@ class 
BPListReader(object): def __unpackFloatStruct(self, sz, s): '''__unpackFloatStruct(size, string) -> float - + Unpacks the float of given size (4 or 8 bytes) from string ''' - if sz == 4: + if sz == 4: ot = '!f' elif sz == 8: ot = '!d' @@ -125,7 +127,7 @@ class BPListReader(object): def __unpackFloat(self, offset): '''__unpackFloat(offset) -> float - + Unpacks float field from plist at given offset ''' obj_header = self.data[offset] @@ -135,70 +137,79 @@ class BPListReader(object): def __unpackDate(self, offset): td = int(struct.unpack(">d", self.data[offset+1:offset+9])[0]) - return datetime(year=2001,month=1,day=1) + timedelta(seconds=td) + return datetime(year=2001, month=1, day=1) + timedelta(seconds=td) def __unpackItem(self, offset): '''__unpackItem(offset) - + Unpacks and returns an item from plist ''' obj_header = self.data[offset] obj_type, obj_info = (obj_header & 0xF0), (obj_header & 0x0F) - if obj_type == 0x00: - if obj_info == 0x00: # null 0000 0000 + if obj_type == 0x00: + if obj_info == 0x00: # null 0000 0000 return None - elif obj_info == 0x08: # bool 0000 1000 // false + elif obj_info == 0x08: # bool 0000 1000 // false return False - elif obj_info == 0x09: # bool 0000 1001 // true + elif obj_info == 0x09: # bool 0000 1001 // true return True - elif obj_info == 0x0F: # fill 0000 1111 // fill byte - raise Exception("0x0F Not Implemented") # this is really pad byte, FIXME + elif obj_info == 0x0F: # fill 0000 1111 // fill byte + raise Exception("0x0F Not Implemented") # this is really pad byte, FIXME else: - raise Exception('unpack item type '+str(obj_header)+' at '+str(offset)+ 'failed') - elif obj_type == 0x10: # int 0001 nnnn ... // # of bytes is 2^nnnn, big-endian bytes + raise Exception('unpack item type '+str(obj_header)+' at '+str(offset) + 'failed') + elif obj_type == 0x10: # int 0001 nnnn ... // # of bytes is 2^nnnn, big-endian bytes return self.__unpackInt(offset) - elif obj_type == 0x20: # real 0010 nnnn ... 
// # of bytes is 2^nnnn, big-endian bytes + elif obj_type == 0x20: # real 0010 nnnn ... // # of bytes is 2^nnnn, big-endian bytes return self.__unpackFloat(offset) - elif obj_type == 0x30: # date 0011 0011 ... // 8 byte float follows, big-endian bytes + elif obj_type == 0x30: # date 0011 0011 ... // 8 byte float follows, big-endian bytes return self.__unpackDate(offset) - elif obj_type == 0x40: # data 0100 nnnn [int] ... // nnnn is number of bytes unless 1111 then int count follows, followed by bytes + # data 0100 nnnn [int] ... // nnnn is number of bytes unless 1111 then int count follows, followed by bytes + elif obj_type == 0x40: obj_count, objref = self.__resolveIntSize(obj_info, offset) - return self.data[objref:objref+obj_count] # XXX: we return data as str - elif obj_type == 0x50: # string 0101 nnnn [int] ... // ASCII string, nnnn is # of chars, else 1111 then int count, then bytes + return self.data[objref:objref+obj_count] # XXX: we return data as str + # string 0101 nnnn [int] ... // ASCII string, nnnn is # of chars, else 1111 then int count, then bytes + elif obj_type == 0x50: obj_count, objref = self.__resolveIntSize(obj_info, offset) return self.data[objref:objref+obj_count] - elif obj_type == 0x60: # string 0110 nnnn [int] ... // Unicode string, nnnn is # of chars, else 1111 then int count, then big-endian 2-byte uint16_t + # string 0110 nnnn [int] ... // Unicode string, nnnn is # of chars, else 1111 then int count, then big-endian 2-byte uint16_t + elif obj_type == 0x60: obj_count, objref = self.__resolveIntSize(obj_info, offset) return self.data[objref:objref+obj_count*2].decode('utf-16be') - elif obj_type == 0x80: # uid 1000 nnnn ... // nnnn+1 is # of bytes + elif obj_type == 0x80: # uid 1000 nnnn ... 
// nnnn+1 is # of bytes # FIXME: Accept as a string for now obj_count, objref = self.__resolveIntSize(obj_info, offset) return self.data[objref:objref+obj_count] - elif obj_type == 0xA0: # array 1010 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + # array 1010 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + elif obj_type == 0xA0: obj_count, objref = self.__resolveIntSize(obj_info, offset) arr = [] for i in range(obj_count): - arr.append(self.__unpackIntStruct(self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) + arr.append(self.__unpackIntStruct( + self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) return arr - elif obj_type == 0xC0: # set 1100 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + # set 1100 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + elif obj_type == 0xC0: # XXX: not serializable via apple implementation - raise Exception("0xC0 Not Implemented") # FIXME: implement - elif obj_type == 0xD0: # dict 1101 nnnn [int] keyref* objref* // nnnn is count, unless '1111', then int count follows + raise Exception("0xC0 Not Implemented") # FIXME: implement + # dict 1101 nnnn [int] keyref* objref* // nnnn is count, unless '1111', then int count follows + elif obj_type == 0xD0: obj_count, objref = self.__resolveIntSize(obj_info, offset) keys = [] for i in range(obj_count): - keys.append(self.__unpackIntStruct(self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) + keys.append(self.__unpackIntStruct( + self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) values = [] objref += obj_count*self.object_ref_size for i in range(obj_count): - values.append(self.__unpackIntStruct(self.object_ref_size, 
self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) + values.append(self.__unpackIntStruct( + self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) dic = {} for i in range(obj_count): dic[keys[i]] = values[i] return dic else: raise Exception('don\'t know how to unpack obj type '+hex(obj_type)+' at '+str(offset)) - + def __resolveObject(self, idx): try: return self.resolved[idx] @@ -212,7 +223,7 @@ class BPListReader(object): return newArr if type(obj) == dict: newDic = {} - for k,v in obj.items(): + for k, v in obj.items(): key_resolved = self.__resolveObject(k) if isinstance(key_resolved, str): rk = key_resolved @@ -225,15 +236,16 @@ class BPListReader(object): else: self.resolved[idx] = obj return obj - + def parse(self): # read header if self.data[:8] != b'bplist00': raise Exception('Bad magic') - + # read trailer - self.offset_size, self.object_ref_size, self.number_of_objects, self.top_object, self.table_offset = struct.unpack('!6xBB4xI4xI4xI', self.data[-32:]) - #print "** plist offset_size:",self.offset_size,"objref_size:",self.object_ref_size,"num_objs:",self.number_of_objects,"top:",self.top_object,"table_ofs:",self.table_offset + self.offset_size, self.object_ref_size, self.number_of_objects, self.top_object, self.table_offset = struct.unpack( + '!6xBB4xI4xI4xI', self.data[-32:]) + # print "** plist offset_size:",self.offset_size,"objref_size:",self.object_ref_size,"num_objs:",self.number_of_objects,"top:",self.top_object,"table_ofs:",self.table_offset # read offset table self.offset_table = self.data[self.table_offset:-32] @@ -243,40 +255,45 @@ class BPListReader(object): offset_entry = ot[:self.offset_size] ot = ot[self.offset_size:] self.offsets.append(self.__unpackIntStruct(self.offset_size, offset_entry)) - #print "** plist offsets:",self.offsets - + # print "** plist offsets:",self.offsets + # read object table self.objects = [] k = 0 for i in 
self.offsets: obj = self.__unpackItem(i) - #print "** plist unpacked",k,type(obj),obj,"at",i + # print "** plist unpacked",k,type(obj),obj,"at",i k += 1 self.objects.append(obj) - + # rebuild object tree - #for i in range(len(self.objects)): + # for i in range(len(self.objects)): # self.__resolveObject(i) - + # return root object return self.__resolveObject(self.top_object) - + @classmethod def plistWithString(cls, s): parser = cls(s) return parser.parse() # helpers for testing + + def plist(obj): from Foundation import NSPropertyListSerialization, NSPropertyListBinaryFormat_v1_0 - b = NSPropertyListSerialization.dataWithPropertyList_format_options_error_(obj, NSPropertyListBinaryFormat_v1_0, 0, None) + b = NSPropertyListSerialization.dataWithPropertyList_format_options_error_( + obj, NSPropertyListBinaryFormat_v1_0, 0, None) return str(b.bytes()) + def unplist(s): from Foundation import NSData, NSPropertyListSerialization d = NSData.dataWithBytes_length_(s, len(s)) return NSPropertyListSerialization.propertyListWithData_options_format_error_(d, 0, None, None) + if __name__ == "__main__": import os import sys diff --git a/Whatsapp_Chat_Exporter/exported_handler.py b/Whatsapp_Chat_Exporter/exported_handler.py index 4a81adb..9e53c23 100644 --- a/Whatsapp_Chat_Exporter/exported_handler.py +++ b/Whatsapp_Chat_Exporter/exported_handler.py @@ -14,12 +14,12 @@ logger = logging.getLogger(__name__) def messages(path, data, assume_first_as_me=False): """ Extracts messages from an exported WhatsApp chat file. 
- + Args: path: Path to the exported chat file data: Data container object to store the parsed chat assume_first_as_me: If True, assumes the first message is sent from the user without asking - + Returns: Updated data container with extracted messages """ @@ -27,16 +27,16 @@ def messages(path, data, assume_first_as_me=False): chat = data.add_chat("ExportedChat", ChatStore(Device.EXPORTED)) you = "" # Will store the username of the current user user_identification_done = False # Flag to track if user identification has been done - + # First pass: count total lines for progress reporting with open(path, "r", encoding="utf8") as file: total_row_number = sum(1 for _ in file) - + # Second pass: process the messages with open(path, "r", encoding="utf8") as file: for index, line in enumerate(file): you, user_identification_done = process_line( - line, index, chat, path, you, + line, index, chat, path, you, assume_first_as_me, user_identification_done ) @@ -51,31 +51,31 @@ def messages(path, data, assume_first_as_me=False): def process_line(line, index, chat, file_path, you, assume_first_as_me, user_identification_done): """ Process a single line from the chat file - + Returns: Tuple of (updated_you_value, updated_user_identification_done_flag) """ parts = line.split(" - ", 1) - + # Check if this is a new message (has timestamp format) if len(parts) > 1: time = parts[0] you, user_identification_done = process_new_message( - time, parts[1], index, chat, you, file_path, + time, parts[1], index, chat, you, file_path, assume_first_as_me, user_identification_done ) else: # This is a continuation of the previous message process_message_continuation(line, index, chat) - + return you, user_identification_done -def process_new_message(time, content, index, chat, you, file_path, +def process_new_message(time, content, index, chat, you, file_path, assume_first_as_me, user_identification_done): """ Process a line that contains a new message - + Returns: Tuple of (updated_you_value, 
updated_user_identification_done_flag) """ @@ -88,7 +88,7 @@ def process_new_message(time, content, index, chat, you, file_path, received_timestamp=None, read_timestamp=None ) - + # Check if this is a system message (no name:message format) if ":" not in content: msg.data = content @@ -96,7 +96,7 @@ def process_new_message(time, content, index, chat, you, file_path, else: # Process user message name, message = content.strip().split(":", 1) - + # Handle user identification if you == "": if chat.name is None: @@ -113,17 +113,17 @@ def process_new_message(time, content, index, chat, you, file_path, # If we know the chat name, anyone else must be "you" if name != chat.name: you = name - + # Set the chat name if needed if chat.name is None and name != you: chat.name = name - + # Determine if this message is from the current user msg.from_me = (name == you) - + # Process message content process_message_content(msg, message, file_path) - + chat.add_message(index, msg) return you, user_identification_done @@ -144,11 +144,11 @@ def process_attached_file(msg, message, file_path): """Process an attached file in a message""" mime = MimeTypes() msg.media = True - + # Extract file path and check if it exists file_name = message.split("(file attached)")[0].strip() attached_file_path = os.path.join(os.path.dirname(file_path), file_name) - + if os.path.isfile(attached_file_path): msg.data = attached_file_path guess = mime.guess_type(attached_file_path)[0] @@ -165,9 +165,9 @@ def process_message_continuation(line, index, chat): lookback = index - 1 while lookback not in chat.keys(): lookback -= 1 - + msg = chat.get_message(lookback) - + # Add the continuation line to the message if msg.media: msg.caption = line.strip() @@ -182,4 +182,4 @@ def prompt_for_user_identification(name): if ans == "y": return name elif ans == "n": - return "" \ No newline at end of file + return "" diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 
a833c7f..14a43e0 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -21,14 +21,14 @@ def contacts(db, data): c.execute("""SELECT count() FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") total_row_number = c.fetchone()[0] logger.info(f"Pre-processing contacts...({total_row_number})\r") - + c.execute("""SELECT ZWHATSAPPID, ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") content = c.fetchone() while content is not None: zwhatsapp_id = content["ZWHATSAPPID"] if not zwhatsapp_id.endswith("@s.whatsapp.net"): zwhatsapp_id += "@s.whatsapp.net" - + current_chat = ChatStore(Device.IOS) current_chat.status = content["ZABOUTTEXT"] data.add_chat(zwhatsapp_id, current_chat) @@ -40,7 +40,7 @@ def process_contact_avatars(current_chat, media_folder, contact_id): """Process and assign avatar images for a contact.""" path = f'{media_folder}/Media/Profile/{contact_id.split("@")[0]}' avatars = glob(f"{path}*") - + if 0 < len(avatars) <= 1: current_chat.their_avatar = avatars[0] else: @@ -64,12 +64,14 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """Process WhatsApp messages and contacts from the database.""" c = db.cursor() cursor2 = db.cursor() - + # Build the chat filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") date_filter = f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else '' - + # Process contacts first contact_query = f""" SELECT count() @@ -110,13 
+112,13 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, GROUP BY ZCONTACTJID; """ c.execute(contacts_query) - + # Process each contact content = c.fetchone() while content is not None: contact_name = get_contact_name(content) contact_id = content["ZCONTACTJID"] - + # Add or update chat if contact_id not in data: current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder)) @@ -124,11 +126,11 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, current_chat = data.get_chat(contact_id) current_chat.name = contact_name current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg") - + # Process avatar images process_contact_avatars(current_chat, media_folder, contact_id) content = c.fetchone() - + logger.info(f"Processed {total_row_number} contacts{CLEAR_LINE}") # Get message count @@ -147,7 +149,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, c.execute(message_count_query) total_row_number = c.fetchone()[0] logger.info(f"Processing messages...(0/{total_row_number})\r") - + # Fetch messages messages_query = f""" SELECT ZCONTACTJID, @@ -175,7 +177,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, ORDER BY ZMESSAGEDATE ASC; """ c.execute(messages_query) - + # Process each message i = 0 content = c.fetchone() @@ -183,14 +185,14 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, contact_id = content["ZCONTACTJID"] message_pk = content["Z_PK"] is_group_message = content["ZGROUPINFO"] is not None - + # Ensure chat exists if contact_id not in data: current_chat = data.add_chat(contact_id, ChatStore(Device.IOS)) process_contact_avatars(current_chat, media_folder, contact_id) else: current_chat = data.get_chat(contact_id) - + # Create message object ts = APPLE_TIME + content["ZMESSAGEDATE"] message = Message( @@ -203,14 +205,14 @@ def messages(db, data, 
media_folder, timezone_offset, filter_date, filter_chat, received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None, read_timestamp=None # TODO: Add timestamp ) - + # Process message data invalid = process_message_data(message, content, is_group_message, data, cursor2) - + # Add valid messages to chat if not invalid: current_chat.add_message(message_pk, message) - + # Update progress i += 1 if i % 1000 == 0: @@ -236,11 +238,11 @@ def process_message_data(message, content, is_group_message, data, cursor2): message.sender = name or fallback else: message.sender = None - + # Handle metadata messages if content["ZMESSAGETYPE"] == 6: return process_metadata_message(message, content, is_group_message) - + # Handle quoted replies if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14") and False: quoted = content["ZMETADATA"][2:19] @@ -250,17 +252,17 @@ def process_message_data(message, content, is_group_message, data, cursor2): WHERE ZSTANZAID LIKE '{message.reply}%'""") quoted_content = cursor2.fetchone() if quoted_content and "ZTEXT" in quoted_content: - message.quoted_data = quoted_content["ZTEXT"] + message.quoted_data = quoted_content["ZTEXT"] else: message.quoted_data = None - + # Handle stickers if content["ZMESSAGETYPE"] == 15: message.sticker = True # Process message text process_message_text(message, content) - + return False # Message is valid @@ -305,19 +307,21 @@ def process_message_text(message, content): msg = content["ZTEXT"] if msg is not None: msg = msg.replace("\r\n", "
").replace("\n", "
") - + message.data = msg def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False): """Process media files from WhatsApp messages.""" c = db.cursor() - + # Build filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID","ZMEMBERJID"], "ZGROUPINFO", "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") date_filter = f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else '' - + # Get media count media_count_query = f""" SELECT count() @@ -336,7 +340,7 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa c.execute(media_count_query) total_row_number = c.fetchone()[0] logger.info(f"Processing media...(0/{total_row_number})\r") - + # Fetch media items media_query = f""" SELECT ZCONTACTJID, @@ -360,14 +364,14 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa ORDER BY ZCONTACTJID ASC """ c.execute(media_query) - + # Process each media item mime = MimeTypes() i = 0 content = c.fetchone() while content is not None: process_media_item(content, data, media_folder, mime, separate_media) - + # Update progress i += 1 if i % 100 == 0: @@ -382,23 +386,24 @@ def process_media_item(content, data, media_folder, mime, separate_media): current_chat = data.get_chat(content["ZCONTACTJID"]) message = current_chat.get_message(content["ZMESSAGE"]) message.media = True - + if current_chat.media_base == "": current_chat.media_base = media_folder + "/" - + if os.path.isfile(file_path): message.data = '/'.join(file_path.split("/")[1:]) - + # Set MIME type if 
content["ZVCARDSTRING"] is None: guess = mime.guess_type(file_path)[0] message.mime = guess if guess is not None else "application/octet-stream" else: message.mime = content["ZVCARDSTRING"] - + # Handle separate media option if separate_media: - chat_display_name = slugify(current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True) + chat_display_name = slugify( + current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) @@ -410,7 +415,7 @@ def process_media_item(content, data, media_folder, mime, separate_media): message.data = "The media is missing" message.mime = "media" message.meta = True - + # Add caption if available if content["ZTITLE"] is not None: message.caption = content["ZTITLE"] @@ -419,12 +424,14 @@ def process_media_item(content, data, media_folder, mime, separate_media): def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): """Process vCard contacts from WhatsApp messages.""" c = db.cursor() - + # Build filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") date_filter = f'AND ZWAMESSAGE.ZMESSAGEDATE {filter_date}' if filter_date is not None else '' - + # Fetch vCard mentions vcard_query = f""" SELECT DISTINCT ZWAVCARDMENTION.ZMEDIAITEM, @@ -450,7 +457,7 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): contents = c.fetchall() total_row_number = 
len(contents) logger.info(f"Processing vCards...(0/{total_row_number})\r") - + # Create vCards directory path = f'{media_folder}/Message/vCards' Path(path).mkdir(parents=True, exist_ok=True) @@ -484,9 +491,10 @@ def process_vcard_item(content, path, data): f.write(vcard_string) # Create vCard summary and update message - vcard_summary = "This media include the following vCard file(s):
" - vcard_summary += " | ".join([f'{htmle(name)}' for name, fp in zip(vcard_names, file_paths)]) - + vcard_summary = "This media include the following vCard file(s):
" + vcard_summary += " | ".join([f'{htmle(name)}' for name, + fp in zip(vcard_names, file_paths)]) + message = data.get_chat(content["ZCONTACTJID"]).get_message(content["ZMESSAGE"]) message.data = vcard_summary message.mime = "text/x-vcard" @@ -498,11 +506,13 @@ def process_vcard_item(content, path, data): def calls(db, data, timezone_offset, filter_chat): """Process WhatsApp call records.""" c = db.cursor() - + # Build filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") - + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") + # Get call count call_count_query = f""" SELECT count() @@ -515,9 +525,9 @@ def calls(db, data, timezone_offset, filter_chat): total_row_number = c.fetchone()[0] if total_row_number == 0: return - + logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}\n") - + # Fetch call records calls_query = f""" SELECT ZCALLIDSTRING, @@ -538,16 +548,16 @@ def calls(db, data, timezone_offset, filter_chat): {chat_filter_exclude} """ c.execute(calls_query) - + # Create calls chat chat = ChatStore(Device.ANDROID, "WhatsApp Calls") - + # Process each call content = c.fetchone() while content is not None: process_call_record(content, chat, data, timezone_offset) content = c.fetchone() - + # Add calls chat to data data.add_chat("000000000000000", chat) @@ -562,7 +572,7 @@ def process_call_record(content, chat, data, timezone_offset): key_id=content["ZCALLIDSTRING"], timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET ) - + # Set sender info _jid = content["ZGROUPCALLCREATORUSERJIDSTRING"] name = data.get_chat(_jid).name if _jid in data else None @@ -571,11 +581,11 
@@ def process_call_record(content, chat, data, timezone_offset): else: fallback = None call.sender = name or fallback - + # Set call metadata call.meta = True call.data = format_call_data(call, content) - + # Add call to chat chat.add_message(call.key_id, call) @@ -589,7 +599,7 @@ def format_call_data(call, content): f"call {'to' if call.from_me else 'from'} " f"{call.sender} was " ) - + # Call outcome if content['ZOUTCOME'] in (1, 4): call_data += "not answered." if call.from_me else "missed." @@ -604,5 +614,5 @@ def format_call_data(call, content): ) else: call_data += "in an unknown state." - - return call_data \ No newline at end of file + + return call_data diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index a3c26e2..bee7d9b 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -18,6 +18,7 @@ else: logger = logging.getLogger(__name__) + class BackupExtractor: """ A class to handle the extraction of WhatsApp data from iOS backups, @@ -61,9 +62,9 @@ class BackupExtractor: """ if not support_encrypted: logger.error("You don't have the dependencies to handle encrypted backup." 
- "Read more on how to deal with encrypted backup:" - "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" - ) + "Read more on how to deal with encrypted backup:" + "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" + ) return logger.info(f"Encryption detected on the backup!{CLEAR_LINE}") @@ -116,12 +117,12 @@ class BackupExtractor: exit(6) else: logger.info(f"Done{CLEAR_LINE}") - + def _extract_decrypted_files(self): """Extract all WhatsApp files after decryption""" def extract_progress_handler(file_id, domain, relative_path, n, total_files): if n % 100 == 0: - logger.info(f"Decrypting and extracting files...({n}/{total_files})\r") + logger.info(f"Decrypting and extracting files...({n}/{total_files})\r") return True self.backup.extract_files( @@ -234,4 +235,3 @@ def extract_media(base_dir, identifiers, decrypt_chunk_size): """ extractor = BackupExtractor(base_dir, identifiers, decrypt_chunk_size) extractor.extract() - diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 08ea6b0..e656ddc 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -33,6 +33,7 @@ CLEAR_LINE = "\x1b[K\n" logger = logging.getLogger(__name__) + def convert_time_unit(time_second: int) -> str: """Converts a time duration in seconds to a human-readable string. 
@@ -168,7 +169,7 @@ def check_update(): "===============Update===============\n" "A newer version of WhatsApp Chat Exporter is available.\n" f"Current version: {__version__}\n" - f"Latest version: {package_info['info']['version']}\n" + f"Latest version: {package_info['info']['version']}\n" ) if platform == "win32": logger.info("Update with: pip install --upgrade whatsapp-chat-exporter\n") diff --git a/scripts/brazilian_number_processing.py b/scripts/brazilian_number_processing.py index bb51213..a42678c 100644 --- a/scripts/brazilian_number_processing.py +++ b/scripts/brazilian_number_processing.py @@ -6,19 +6,20 @@ Contributed by @magpires https://github.com/KnugiHK/WhatsApp-Chat-Exporter/issue import re import argparse + def process_phone_number(raw_phone): """ Process the raw phone string from the VCARD and return two formatted numbers: - The original formatted number, and - A modified formatted number with the extra (ninth) digit removed, if applicable. - + Desired output: For a number with a 9-digit subscriber: Original: "+55 {area} {first 5 of subscriber}-{last 4 of subscriber}" Modified: "+55 {area} {subscriber[1:5]}-{subscriber[5:]}" For example, for an input that should represent "027912345678", the outputs are: "+55 27 91234-5678" and "+55 27 1234-5678" - + This function handles numbers that may already include a "+55" prefix. It expects that after cleaning, a valid number (without the country code) should have either 10 digits (2 for area + 8 for subscriber) or 11 digits (2 for area + 9 for subscriber). 
@@ -26,18 +27,18 @@ def process_phone_number(raw_phone): """ # Store the original input for processing number_to_process = raw_phone.strip() - + # Remove all non-digit characters digits = re.sub(r'\D', '', number_to_process) - + # If the number starts with '55', remove it for processing if digits.startswith("55") and len(digits) > 11: digits = digits[2:] - + # Remove trunk zero if present if digits.startswith("0"): digits = digits[1:] - + # After cleaning, we expect a valid number to have either 10 or 11 digits # If there are extra digits, use the last 11 (for a 9-digit subscriber) or last 10 (for an 8-digit subscriber) if len(digits) > 11: @@ -46,7 +47,7 @@ def process_phone_number(raw_phone): elif len(digits) > 10 and len(digits) < 11: # In some cases with an 8-digit subscriber, take the last 10 digits digits = digits[-10:] - + # Check if we have a valid number after processing if len(digits) not in (10, 11): return None, None @@ -70,6 +71,7 @@ def process_phone_number(raw_phone): return original_formatted, modified_formatted + def process_vcard(input_vcard, output_vcard): """ Process a VCARD file to standardize telephone entries and add a second TEL line @@ -77,13 +79,13 @@ def process_vcard(input_vcard, output_vcard): """ with open(input_vcard, 'r', encoding='utf-8') as file: lines = file.readlines() - + output_lines = [] - + # Regex to capture any telephone line. # It matches lines starting with "TEL:" or "TEL;TYPE=..." or with prefixes like "item1.TEL:". 
phone_pattern = re.compile(r'^(?P.*TEL(?:;TYPE=[^:]+)?):(?P.*)$') - + for line in lines: stripped_line = line.rstrip("\n") match = phone_pattern.match(stripped_line) @@ -99,10 +101,11 @@ def process_vcard(input_vcard, output_vcard): output_lines.append(f"TEL;TYPE=CELL:{mod_formatted}\n") else: output_lines.append(line) - + with open(output_vcard, 'w', encoding='utf-8') as file: file.writelines(output_lines) + if __name__ == '__main__': parser = argparse.ArgumentParser( description="Process a VCARD file to standardize telephone entries and add a second TEL line with the modified number (removing the extra ninth digit) for contacts with 9-digit subscribers." @@ -110,6 +113,6 @@ if __name__ == '__main__': parser.add_argument('input_vcard', type=str, help='Input VCARD file') parser.add_argument('output_vcard', type=str, help='Output VCARD file') args = parser.parse_args() - + process_vcard(args.input_vcard, args.output_vcard) - print(f"VCARD processed and saved to {args.output_vcard}") \ No newline at end of file + print(f"VCARD processed and saved to {args.output_vcard}") diff --git a/scripts/bruteforce_crypt15.py b/scripts/bruteforce_crypt15.py index c8646d4..d4497ce 100644 --- a/scripts/bruteforce_crypt15.py +++ b/scripts/bruteforce_crypt15.py @@ -26,6 +26,7 @@ def _extract_encrypted_key(keyfile): return _generate_hmac_of_hmac(key_stream) + if __name__ == "__main__": key = open("encrypted_backup.key", "rb").read() database = open("wa.db.crypt15", "rb").read() diff --git a/tests/test_brazilian_number_processing.py b/tests/test_brazilian_number_processing.py index f7ccd27..3612481 100644 --- a/tests/test_brazilian_number_processing.py +++ b/tests/test_brazilian_number_processing.py @@ -6,11 +6,12 @@ from unittest.mock import patch from scripts.brazilian_number_processing import process_phone_number, process_vcard + class TestVCardProcessor(unittest.TestCase): - + def test_process_phone_number(self): """Test the process_phone_number function with various inputs.""" - + 
# Test cases for 9-digit subscriber numbers test_cases_9_digit = [ # Standard 11-digit number (2 area + 9 subscriber) @@ -30,7 +31,7 @@ class TestVCardProcessor(unittest.TestCase): # With extra non-digit characters ("+55-27-9.1234_5678", "+55 27 91234-5678", "+55 27 1234-5678"), ] - + # Test cases for 8-digit subscriber numbers test_cases_8_digit = [ # Standard 10-digit number (2 area + 8 subscriber) @@ -46,7 +47,7 @@ class TestVCardProcessor(unittest.TestCase): # With country code and trunk zero ("+55 0 27 1234-5678", "+55 27 1234-5678", None), ] - + # Edge cases edge_cases = [ # Too few digits @@ -60,19 +61,19 @@ class TestVCardProcessor(unittest.TestCase): # Unusual formatting but valid number ("(+55) [27] 9.1234_5678", "+55 27 91234-5678", "+55 27 1234-5678"), ] - + # Run tests for all cases all_cases = test_cases_9_digit + test_cases_8_digit + edge_cases - + for raw_phone, expected_orig, expected_mod in all_cases: with self.subTest(raw_phone=raw_phone): orig, mod = process_phone_number(raw_phone) self.assertEqual(orig, expected_orig) self.assertEqual(mod, expected_mod) - + def test_process_vcard(self): """Test the process_vcard function with various VCARD formats.""" - + # Test case 1: Standard TEL entries vcard1 = """BEGIN:VCARD VERSION:3.0 @@ -202,26 +203,26 @@ END:VCARD (vcard5, expected5), (vcard6, expected6) ] - + for i, (input_vcard, expected_output) in enumerate(test_cases): with self.subTest(case=i+1): # Create temporary files for input and output with tempfile.NamedTemporaryFile(mode='w+', delete=False, encoding='utf-8') as input_file: input_file.write(input_vcard) input_path = input_file.name - + output_path = input_path + '.out' - + try: # Process the VCARD process_vcard(input_path, output_path) - + # Read and verify the output with open(output_path, 'r', encoding='utf-8') as output_file: actual_output = output_file.read() self.assertEqual(actual_output, expected_output) - + finally: # Clean up temporary files if os.path.exists(input_path): @@ 
-231,7 +232,7 @@ END:VCARD def test_script_argument_handling(self): """Test the script's command-line argument handling.""" - + test_input = """BEGIN:VCARD VERSION:3.0 N:Test;User;;; @@ -239,16 +240,17 @@ FN:User Test TEL:+5527912345678 END:VCARD """ - + # Create a temporary input file with tempfile.NamedTemporaryFile(mode='w+', delete=False, encoding='utf-8') as input_file: input_file.write(test_input) input_path = input_file.name - + output_path = input_path + '.out' - + try: - test_args = ['python' if os.name == 'nt' else 'python3', 'scripts/brazilian_number_processing.py', input_path, output_path] + test_args = ['python' if os.name == 'nt' else 'python3', + 'scripts/brazilian_number_processing.py', input_path, output_path] # We're just testing that the argument parsing works subprocess.call( test_args, @@ -257,7 +259,7 @@ END:VCARD ) # Check if the output file was created self.assertTrue(os.path.exists(output_path)) - + finally: # Clean up temporary files if os.path.exists(input_path): @@ -265,5 +267,6 @@ END:VCARD if os.path.exists(output_path): os.unlink(output_path) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py index 5d6ef58..527e5ae 100644 --- a/tests/test_incremental_merge.py +++ b/tests/test_incremental_merge.py @@ -178,14 +178,14 @@ def test_incremental_merge_new_file(mock_filesystem): source_dir = "/source" target_dir = "/target" media_dir = "media" - + # Setup mock filesystem mock_filesystem["exists"].side_effect = lambda x: x == "/source" mock_filesystem["listdir"].return_value = ["chat.json"] - + # Run the function incremental_merge(source_dir, target_dir, media_dir, 2, True) - + # Verify the operations mock_filesystem["makedirs"].assert_called_once_with(target_dir, exist_ok=True) mock_filesystem["copy2"].assert_called_once_with( diff --git a/tests/test_nuitka_binary.py b/tests/test_nuitka_binary.py index 0e91930..0cd986e 100644 --- a/tests/test_nuitka_binary.py +++ 
b/tests/test_nuitka_binary.py @@ -43,9 +43,9 @@ def test_nuitka_binary(): "--assume-yes-for-downloads", "--follow-imports", "Whatsapp_Chat_Exporter/__main__.py", - "--output-filename=wtsexporter.exe" # use .exe on all platforms for compatibility + "--output-filename=wtsexporter.exe" # use .exe on all platforms for compatibility ] - + compile_result = subprocess.run( nuitka_command, capture_output=True, diff --git a/tests/test_vcards_contacts.py b/tests/test_vcards_contacts.py index 5d22b56..62cf603 100644 --- a/tests/test_vcards_contacts.py +++ b/tests/test_vcards_contacts.py @@ -8,12 +8,15 @@ def test_readVCardsFile(): data_dir = os.path.join(os.path.dirname(__file__), "data") assert len(read_vcards_file(os.path.join(data_dir, "contacts.vcf"), "852")) > 0 + def test_create_number_to_name_dicts(): pass + def test_fuzzy_match_numbers(): pass + def test_normalize_number(): assert normalize_number('0531234567', '1') == '1531234567' assert normalize_number('001531234567', '2') == '1531234567' From 587b74352216cd9472c32c1befbd1801d77fa67f Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 18:14:41 +0800 Subject: [PATCH 050/108] Fix logging for decrypting whatsapp database --- Whatsapp_Chat_Exporter/ios_media_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index bee7d9b..a010c92 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -87,8 +87,8 @@ class BackupExtractor: check_same_thread=False, decrypt_chunk_size=self.decrypt_chunk_size, ) - logger.info(f"iOS backup decrypted successfully!{CLEAR_LINE}") - logger.info("Decrypting WhatsApp database...\n") + logger.info(f"iOS backup decrypted successfully{CLEAR_LINE}") + logger.info("Decrypting WhatsApp database...\r") try: self.backup.extract_file( 
relative_path=RelativePath.WHATSAPP_MESSAGES, @@ -116,7 +116,7 @@ class BackupExtractor: ) exit(6) else: - logger.info(f"Done{CLEAR_LINE}") + logger.info(f"WhatsApp database decrypted successfully{CLEAR_LINE}") def _extract_decrypted_files(self): """Extract all WhatsApp files after decryption""" From 96e483a6b0a2dc10a0b67bb9759a37812f426d70 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 11 May 2025 18:16:17 +0800 Subject: [PATCH 051/108] Clean up unused code in bplist.py --- Whatsapp_Chat_Exporter/bplist.py | 64 -------------------------------- 1 file changed, 64 deletions(-) diff --git a/Whatsapp_Chat_Exporter/bplist.py b/Whatsapp_Chat_Exporter/bplist.py index 4ca572a..126dfc9 100644 --- a/Whatsapp_Chat_Exporter/bplist.py +++ b/Whatsapp_Chat_Exporter/bplist.py @@ -25,40 +25,6 @@ import codecs from datetime import datetime, timedelta -class BPListWriter(object): - def __init__(self, objects): - self.bplist = "" - self.objects = objects - - def binary(self): - '''binary -> string - - Generates bplist - ''' - self.data = 'bplist00' - - # TODO: flatten objects and count max length size - - # TODO: write objects and save offsets - - # TODO: write offsets - - # TODO: write metadata - - return self.data - - def write(self, filename): - ''' - - Writes bplist to file - ''' - if self.bplist != "": - pass - # TODO: save self.bplist to file - else: - raise Exception('BPlist not yet generated') - - class BPListReader(object): def __init__(self, s): self.data = s @@ -277,33 +243,3 @@ class BPListReader(object): def plistWithString(cls, s): parser = cls(s) return parser.parse() - -# helpers for testing - - -def plist(obj): - from Foundation import NSPropertyListSerialization, NSPropertyListBinaryFormat_v1_0 - b = NSPropertyListSerialization.dataWithPropertyList_format_options_error_( - obj, NSPropertyListBinaryFormat_v1_0, 0, None) - return str(b.bytes()) - - -def unplist(s): - from Foundation import NSData, 
NSPropertyListSerialization - d = NSData.dataWithBytes_length_(s, len(s)) - return NSPropertyListSerialization.propertyListWithData_options_format_error_(d, 0, None, None) - - -if __name__ == "__main__": - import os - import sys - import json - file_path = sys.argv[1] - - with open(file_path, "rb") as fp: - data = fp.read() - - out = BPListReader(data).parse() - - with open(file_path + ".json", "w") as fp: - json.dump(out, indent=4) From 3f88f7fe08f4392fb24e3b48120b7ce23772ff91 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 16:04:31 +0800 Subject: [PATCH 052/108] Replacing slugify with a new function --- Whatsapp_Chat_Exporter/android_handler.py | 6 ++-- Whatsapp_Chat_Exporter/ios_handler.py | 6 ++-- Whatsapp_Chat_Exporter/utility.py | 33 +++++++++-------- tests/test_utility.py | 43 +++++++++++++++++++++++ 4 files changed, 67 insertions(+), 21 deletions(-) create mode 100644 tests/test_utility.py diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index d371b4f..46e0511 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -13,7 +13,7 @@ from Whatsapp_Chat_Exporter.data_model import ChatStore, Message from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata -from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable logger = logging.getLogger(__name__) @@ -668,8 +668,8 @@ def _process_single_media(data, content, media_folder, mime, separate_media): # Copy media to separate folder if needed if separate_media: - chat_display_name = 
slugify(current_chat.name or message.sender - or content["key_remote_jid"].split('@')[0], True) + chat_display_name = safe_name(current_chat.name or message.sender + or content["key_remote_jid"].split('@')[0]) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 14a43e0..3c40202 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -9,7 +9,7 @@ from mimetypes import MimeTypes from markupsafe import escape as htmle from Whatsapp_Chat_Exporter.data_model import ChatStore, Message from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition -from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, slugify, Device +from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name, Device logger = logging.getLogger(__name__) @@ -402,8 +402,8 @@ def process_media_item(content, data, media_folder, mime, separate_media): # Handle separate media option if separate_media: - chat_display_name = slugify( - current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True) + chat_display_name = safe_name( + current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0]) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index e656ddc..9eda832 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -5,6 +5,7 @@ import json import os import unicodedata import re +import string import math import shutil from bleach import clean as sanitize @@ -12,7 +13,7 @@ from markupsafe import 
Markup from datetime import datetime, timedelta from enum import IntEnum from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union try: from enum import StrEnum, IntEnum except ImportError: @@ -600,26 +601,28 @@ def setup_template(template: Optional[str], no_avatar: bool, experimental: bool APPLE_TIME = 978307200 -def slugify(value: str, allow_unicode: bool = False) -> str: +def safe_name(text: Union[str|bytes]) -> str: """ - Convert text to ASCII-only slugs for URL-safe strings. - Taken from https://github.com/django/django/blob/master/django/utils/text.py + Sanitizes the input text and generates a safe file name. + This function serves a similar purpose to slugify() from + Django previously used in this project, but is a clean-room + reimplementation tailored for performance and a narrower + use case for this project. Licensed under the same terms + as the project (MIT). Args: - value (str): The string to convert to a slug. - allow_unicode (bool, optional): Whether to allow Unicode characters. Defaults to False. + text (str|bytes): The string to be sanitized. Returns: - str: The slugified string with only alphanumerics, underscores, or hyphens. + str: The sanitized string with only alphanumerics, underscores, or hyphens. 
""" - value = str(value) - if allow_unicode: - value = unicodedata.normalize('NFKC', value) - else: - value = unicodedata.normalize('NFKD', value).encode( - 'ascii', 'ignore').decode('ascii') - value = re.sub(r'[^\w\s-]', '', value.lower()) - return re.sub(r'[-\s]+', '-', value).strip('-_') + if isinstance(text, bytes): + text = text.decode("utf-8", "ignore") + elif not isinstance(text, str): + raise TypeError("value must be a string or bytes") + normalized_text = unicodedata.normalize("NFKC", text) + safe_chars = [char for char in normalized_text if char.isalnum() or char in "-_ ."] + return "-".join(''.join(safe_chars).split()) class WhatsAppIdentifier(StrEnum): diff --git a/tests/test_utility.py b/tests/test_utility.py new file mode 100644 index 0000000..bd4bd23 --- /dev/null +++ b/tests/test_utility.py @@ -0,0 +1,43 @@ +import pytest +import random +import string + +from Whatsapp_Chat_Exporter.utility import safe_name + +def generate_random_string(length=50): + random.seed(10) + return ''.join(random.choice(string.ascii_letters + string.digits + "äöüß") for _ in range(length)) + + +# Test cases to validate the safe_name function +safe_name_test_cases = [ + ("This is a test string", "This-is-a-test-string"), + ("This is a test string with special characters!@#$%^&*()", "This-is-a-test-string-with-special-characters"), + ("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"), + ("This is a test string with mixed case ThisIsATestString", "This-is-a-test-string-with-mixed-case-ThisIsATestString"), + ("This is a test string with extra spaces ThisIsATestString", "This-is-a-test-string-with-extra-spaces-ThisIsATestString"), + ("This is a test string with unicode characters äöüß", "This-is-a-test-string-with-unicode-characters-äöüß"), + ("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"), # Chinese characters, should stay as is + ( + f"This is a test string with long length {generate_random_string(1000)}", + 
f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}", + ), + ("", ""), # Empty string + (" ", ""), # String with only space + ("---", "---"), # String with only hyphens + ("___", "___"), # String with only underscores + ("a" * 100, "a" * 100), # Long string with single character + ("a-b-c-d-e", "a-b-c-d-e"), # String with hyphen + ("a_b_c_d_e", "a_b_c_d_e"), # String with underscore + ("a b c d e", "a-b-c-d-e"), # String with spaces + ("test.com/path/to/resource?param1=value1¶m2=value2", "test.compathtoresourceparam1value1param2value2"), # Test with URL + ("filename.txt", "filename.txt"), # Test with filename + ("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.", "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."), # Greek characters + ("This is a test with комбинированные знаки ̆ example", "This-is-a-test-with-комбинированные-знаки-example") # Mixed with unicode +] + + +@pytest.mark.parametrize("input_text, expected_output", safe_name_test_cases) +def test_safe_name(input_text, expected_output): + result = safe_name(input_text) + assert result == expected_output \ No newline at end of file From a53e5a2b3dea13d157d91aff1ee91fb6d2f9b25d Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 16:18:16 +0800 Subject: [PATCH 053/108] Update type hint syntax for Python < 3.10 compatibility --- Whatsapp_Chat_Exporter/utility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 9eda832..afcebd9 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -601,7 +601,7 @@ def setup_template(template: Optional[str], no_avatar: bool, experimental: bool APPLE_TIME = 978307200 -def safe_name(text: Union[str|bytes]) -> str: +def safe_name(text: Union[str, bytes]) -> str: """ Sanitize the input text and generates a safe file name. 
This function serves a similar purpose to slugify() from From 8c2868a60ec6af34a503b5d25ae66419e9c52913 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 16:20:11 +0800 Subject: [PATCH 054/108] Fix on missing return in get_status_location --- Whatsapp_Chat_Exporter/utility.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index afcebd9..6dcb40f 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -567,6 +567,7 @@ def get_status_location(output_folder: str, offline_static: str) -> str: with open(w3css_path, "wb") as f: f.write(resp.read()) w3css = os.path.join(offline_static, "w3.css") + return w3css def setup_template(template: Optional[str], no_avatar: bool, experimental: bool = False) -> jinja2.Template: From d7ded16239c4c39304af08ed4b029df3ddc571cc Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 17:35:30 +0800 Subject: [PATCH 055/108] Reimplement the convert_time_unit function to make it more human-readable --- Whatsapp_Chat_Exporter/utility.py | 42 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 6dcb40f..75744bb 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -44,23 +44,31 @@ def convert_time_unit(time_second: int) -> str: Returns: str: A human-readable string representing the time duration. 
""" - time = str(timedelta(seconds=time_second)) - if "day" not in time: - if time_second < 1: - time = "less than a second" - elif time_second == 1: - time = "a second" - elif time_second < 60: - time = time[5:][1 if time_second < 10 else 0:] + " seconds" - elif time_second == 60: - time = "a minute" - elif time_second < 3600: - time = time[2:] + " minutes" - elif time_second == 3600: - time = "an hour" - else: - time += " hour" - return time + if time_second < 1: + return "less than a second" + elif time_second == 1: + return "a second" + + delta = timedelta(seconds=time_second) + parts = [] + + days = delta.days + if days > 0: + parts.append(f"{days} day{'s' if days > 1 else ''}") + + hours = delta.seconds // 3600 + if hours > 0: + parts.append(f"{hours} hour{'s' if hours > 1 else ''}") + + minutes = (delta.seconds % 3600) // 60 + if minutes > 0: + parts.append(f"{minutes} minute{'s' if minutes > 1 else ''}") + + seconds = delta.seconds % 60 + if seconds > 0: + parts.append(f"{seconds} second{'s' if seconds > 1 else ''}") + + return " ".join(parts) def bytes_to_readable(size_bytes: int) -> str: From ea6e72bf0b3acd4e846c778592432fbe3acd11a6 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 17:46:51 +0800 Subject: [PATCH 056/108] Bug fix on incorrectly striping decimal to integer --- Whatsapp_Chat_Exporter/utility.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 75744bb..2053133 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -118,10 +118,11 @@ def readable_to_bytes(size_str: str) -> int: 'YB': 1024**8 } size_str = size_str.upper().strip() - number, unit = size_str[:-2].strip(), size_str[-2:].strip() - if unit not in SIZE_UNITS or not number.isnumeric(): + unit = ''.join(filter(str.isalpha, size_str)).strip() + number = ''.join(c for c in size_str if c.isdigit() or c 
== '.').strip() + if unit not in SIZE_UNITS: raise ValueError("Invalid input for size_str. Example: 1024GB") - return int(number) * SIZE_UNITS[unit] + return int(float(number) * SIZE_UNITS[unit]) def sanitize_except(html: str) -> Markup: From bbb558713feaf61c6582691f90de71c9438af27c Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 18:24:30 +0800 Subject: [PATCH 057/108] Replace sanitize_filename with safe_name --- Whatsapp_Chat_Exporter/__main__.py | 8 ++++---- Whatsapp_Chat_Exporter/utility.py | 14 +------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 0a7aae3..ff44531 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -12,9 +12,9 @@ import importlib.metadata from Whatsapp_Chat_Exporter import android_crypt, exported_handler, android_handler from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update, DbType -from Whatsapp_Chat_Exporter.utility import readable_to_bytes, sanitize_filename -from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update +from Whatsapp_Chat_Exporter.utility import readable_to_bytes, safe_name, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, DbType from argparse import ArgumentParser, SUPPRESS from datetime import datetime from getpass import getpass @@ -683,7 +683,7 @@ def export_multiple_json(args, data: Dict) -> None: else: contact = jik.replace('+', '') - with open(f"{json_path}/{sanitize_filename(contact)}.json", "w") as f: + with open(f"{json_path}/{safe_name(contact)}.json", "w") as f: file_content = 
json.dumps( {jik: data[jik]}, ensure_ascii=not args.avoid_encoding_json, diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 2053133..5d52d2d 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -334,18 +334,6 @@ def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_p shutil.copy2(source_file, target_file) -def sanitize_filename(file_name: str) -> str: - """Sanitizes a filename by removing invalid and unsafe characters. - - Args: - file_name: The filename to sanitize. - - Returns: - The sanitized filename. - """ - return "".join(x for x in file_name if x.isalnum() or x in "- ") - - def get_file_name(contact: str, chat: ChatStore) -> Tuple[str, str]: """Generates a sanitized filename and contact name for a chat. @@ -375,7 +363,7 @@ def get_file_name(contact: str, chat: ChatStore) -> Tuple[str, str]: else: name = phone_number - return sanitize_filename(file_name), name + return safe_name(file_name), name def get_cond_for_empty(enable: bool, jid_field: str, broadcast_field: str) -> str: From 6a0fca3e9da3befa7605c2030142237bb2493873 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 19:16:57 +0800 Subject: [PATCH 058/108] Add more tests for utility --- tests/test_utility.py | 285 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 249 insertions(+), 36 deletions(-) diff --git a/tests/test_utility.py b/tests/test_utility.py index bd4bd23..1e9012d 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -1,43 +1,256 @@ import pytest import random import string - -from Whatsapp_Chat_Exporter.utility import safe_name - -def generate_random_string(length=50): - random.seed(10) - return ''.join(random.choice(string.ascii_letters + string.digits + "äöüß") for _ in range(length)) +from unittest.mock import patch, mock_open, MagicMock +from Whatsapp_Chat_Exporter.utility import * -# Test cases to validate the 
safe_name function -safe_name_test_cases = [ - ("This is a test string", "This-is-a-test-string"), - ("This is a test string with special characters!@#$%^&*()", "This-is-a-test-string-with-special-characters"), - ("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"), - ("This is a test string with mixed case ThisIsATestString", "This-is-a-test-string-with-mixed-case-ThisIsATestString"), - ("This is a test string with extra spaces ThisIsATestString", "This-is-a-test-string-with-extra-spaces-ThisIsATestString"), - ("This is a test string with unicode characters äöüß", "This-is-a-test-string-with-unicode-characters-äöüß"), - ("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"), # Chinese characters, should stay as is - ( - f"This is a test string with long length {generate_random_string(1000)}", - f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}", - ), - ("", ""), # Empty string - (" ", ""), # String with only space - ("---", "---"), # String with only hyphens - ("___", "___"), # String with only underscores - ("a" * 100, "a" * 100), # Long string with single character - ("a-b-c-d-e", "a-b-c-d-e"), # String with hyphen - ("a_b_c_d_e", "a_b_c_d_e"), # String with underscore - ("a b c d e", "a-b-c-d-e"), # String with spaces - ("test.com/path/to/resource?param1=value1¶m2=value2", "test.compathtoresourceparam1value1param2value2"), # Test with URL - ("filename.txt", "filename.txt"), # Test with filename - ("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.", "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."), # Greek characters - ("This is a test with комбинированные знаки ̆ example", "This-is-a-test-with-комбинированные-знаки-example") # Mixed with unicode -] +def test_convert_time_unit(): + assert convert_time_unit(0) == "less than a second" + assert convert_time_unit(1) == "a second" + assert convert_time_unit(10) == "10 seconds" + assert convert_time_unit(60) == "1 minute" + assert 
convert_time_unit(61) == "1 minute 1 second" + assert convert_time_unit(122) == "2 minutes 2 seconds" + assert convert_time_unit(3600) == "1 hour" + assert convert_time_unit(3661) == "1 hour 1 minute 1 second" + assert convert_time_unit(3720) == "1 hour 2 minutes" + assert convert_time_unit(3660) == "1 hour 1 minute" + assert convert_time_unit(7263) == "2 hours 1 minute 3 seconds" + assert convert_time_unit(86400) == "1 day" + assert convert_time_unit(86461) == "1 day 1 minute 1 second" + assert convert_time_unit(172805) == "2 days 5 seconds" -@pytest.mark.parametrize("input_text, expected_output", safe_name_test_cases) -def test_safe_name(input_text, expected_output): - result = safe_name(input_text) - assert result == expected_output \ No newline at end of file +class TestBytesToReadable: + assert bytes_to_readable(0) == "0B" + assert bytes_to_readable(500) == "500.0 B" + assert bytes_to_readable(1024) == "1.0 KB" + assert bytes_to_readable(2048) == "2.0 KB" + assert bytes_to_readable(1536) == "1.5 KB" + assert bytes_to_readable(1024**2) == "1.0 MB" + assert bytes_to_readable(5 * 1024**2) == "5.0 MB" + assert bytes_to_readable(1024**3) == "1.0 GB" + assert bytes_to_readable(1024**4) == "1.0 TB" + assert bytes_to_readable(1024**5) == "1.0 PB" + assert bytes_to_readable(1024**6) == "1.0 EB" + assert bytes_to_readable(1024**7) == "1.0 ZB" + assert bytes_to_readable(1024**8) == "1.0 YB" + + +class TestReadableToBytes: + def test_conversion(self): + assert readable_to_bytes("100B") == 100 + assert readable_to_bytes("50 B") == 50 + assert readable_to_bytes("1KB") == 1024 + assert readable_to_bytes("2.5 KB") == 2560 + assert readable_to_bytes("2.0 KB") == 2048 + assert readable_to_bytes("1MB") == 1024**2 + assert readable_to_bytes("0.5 MB") == 524288 + assert readable_to_bytes("1. 
MB") == 1048576 + assert readable_to_bytes("1GB") == 1024**3 + assert readable_to_bytes("1.GB") == 1024**3 + assert readable_to_bytes("1TB") == 1024**4 + assert readable_to_bytes("1PB") == 1024**5 + assert readable_to_bytes("1EB") == 1024**6 + assert readable_to_bytes("1ZB") == 1024**7 + assert readable_to_bytes("1YB") == 1024**8 + + def test_case_insensitivity(self): + assert readable_to_bytes("1kb") == 1024 + assert readable_to_bytes("2mB") == 2 * 1024**2 + + def test_whitespace(self): + assert readable_to_bytes(" 10 KB ") == 10 * 1024 + assert readable_to_bytes(" 1 MB") == 1024**2 + + def test_invalid_unit(self): + with pytest.raises(ValueError, match="Invalid input for size_str"): + readable_to_bytes("100X") + readable_to_bytes("A100") + readable_to_bytes("100$$$$$") + + def test_invalid_number(self): + with pytest.raises(ValueError, match="Invalid input for size_str"): + readable_to_bytes("ABC KB") + + def test_missing_unit(self): + with pytest.raises(ValueError, match="Invalid input for size_str"): + readable_to_bytes("100") + + +class TestSanitizeExcept: + def test_no_tags(self): + html = "This is plain text." + assert sanitize_except(html) == Markup("This is plain text.") + + def test_allowed_br_tag(self): + html = "Line 1
Line 2" + assert sanitize_except(html) == Markup("Line 1
Line 2") + html = "
Line" + assert sanitize_except(html) == Markup("
Line") + html = "Line
" + assert sanitize_except(html) == Markup("Line
") + + def test_mixed_tags(self): + html = "Bold
Italic" + assert sanitize_except(html) == Markup( + "<b>Bold</b>
<i>Italic</i><img src='evil.gif'><script>alert('XSS')</script>") + + def test_attribute_stripping(self): + html = "
" + assert sanitize_except(html) == Markup("
") + + +class TestDetermineDay: + def test_same_day(self): + timestamp1 = 1678838400 # March 15, 2023 00:00:00 GMT + timestamp2 = 1678881600 # March 15, 2023 12:00:00 GMT + assert determine_day(timestamp1, timestamp2) is None + + def test_different_day(self): + timestamp1 = 1678886400 # March 15, 2023 00:00:00 GMT + timestamp2 = 1678972800 # March 16, 2023 00:00:00 GMT + assert determine_day(timestamp1, timestamp2) == datetime(2023, 3, 16).date() + + def test_crossing_month(self): + timestamp1 = 1680220800 # March 31, 2023 00:00:00 GMT + timestamp2 = 1680307200 # April 1, 2023 00:00:00 GMT + assert determine_day(timestamp1, timestamp2) == datetime(2023, 4, 1).date() + + def test_crossing_year(self): + timestamp1 = 1703980800 # December 31, 2023 00:00:00 GMT + timestamp2 = 1704067200 # January 1, 2024 00:00:00 GMT + assert determine_day(timestamp1, timestamp2) == datetime(2024, 1, 1).date() + + +class TestGetFileName: + def test_valid_contact_phone_number_no_chat_name(self): + chat = ChatStore(Device.ANDROID, name=None) + filename, name = get_file_name("1234567890@s.whatsapp.net", chat) + assert filename == "1234567890" + assert name == "1234567890" + + def test_valid_contact_phone_number_with_chat_name(self): + chat = ChatStore(Device.IOS, name="My Chat Group") + filename, name = get_file_name("1234567890@s.whatsapp.net", chat) + assert filename == "1234567890-My-Chat-Group" + assert name == "My Chat Group" + + def test_valid_contact_exported_chat(self): + chat = ChatStore(Device.ANDROID, name="Testing") + filename, name = get_file_name("ExportedChat", chat) + assert filename == "ExportedChat-Testing" + assert name == "Testing" + + def test_valid_contact_special_ids(self): + chat = ChatStore(Device.ANDROID, name="Special Chat") + filename_000, name_000 = get_file_name("000000000000000", chat) + assert filename_000 == "000000000000000-Special-Chat" + assert name_000 == "Special Chat" + filename_001, name_001 = get_file_name("000000000000001", chat) + assert 
filename_001 == "000000000000001-Special-Chat" + assert name_001 == "Special Chat" + + def test_unexpected_contact_format(self): + chat = ChatStore(Device.ANDROID, name="Some Chat") + with pytest.raises(ValueError, match="Unexpected contact format: invalid-contact"): + get_file_name("invalid-contact", chat) + + def test_contact_with_hyphen_and_chat_name(self): + chat = ChatStore(Device.ANDROID, name="Another Chat") + filename, name = get_file_name("123-456-7890@g.us", chat) + assert filename == "Another-Chat" + assert name == "Another Chat" + + def test_contact_with_hyphen_no_chat_name(self): + chat = ChatStore(Device.ANDROID, name=None) + filename, name = get_file_name("123-456-7890@g.us", chat) + assert filename == "123-456-7890" + assert name == "123-456-7890" + + +class TestGetCondForEmpty: + def test_enable_true(self): + condition = get_cond_for_empty(True, "c.jid", "c.broadcast") + assert condition == "AND (chat.hidden=0 OR c.jid='status@broadcast' OR c.broadcast>0)" + + def test_enable_false(self): + condition = get_cond_for_empty(False, "other_jid", "other_broadcast") + assert condition == "" + + +class TestGetChatCondition: + ... 
+ + +class TestGetStatusLocation: + @patch('os.path.isdir') + @patch('os.path.isfile') + @patch('os.mkdir') + @patch('urllib.request.urlopen') + @patch('builtins.open', new_callable=mock_open) + def test_offline_static_set(self, mock_open_file, mock_urlopen, mock_mkdir, mock_isfile, mock_isdir): + mock_isdir.return_value = False + mock_isfile.return_value = False + mock_response = MagicMock() + mock_response.read.return_value = b'W3.CSS Content' + mock_urlopen.return_value.__enter__.return_value = mock_response + output_folder = "output_folder" + offline_static = "offline_static" + + result = get_status_location(output_folder, offline_static) + + assert result == os.path.join(offline_static, "w3.css") + mock_mkdir.assert_called_once_with(os.path.join(output_folder, offline_static)) + mock_urlopen.assert_called_once_with("https://www.w3schools.com/w3css/4/w3.css") + mock_open_file.assert_called_once_with(os.path.join(output_folder, offline_static, "w3.css"), "wb") + mock_open_file().write.assert_called_once_with(b'W3.CSS Content') + + def test_offline_static_not_set(self): + result = get_status_location("output_folder", "") + assert result == "https://www.w3schools.com/w3css/4/w3.css" + + +class TestSafeName: + def generate_random_string(length=50): + random.seed(10) + return ''.join(random.choice(string.ascii_letters + string.digits + "äöüß") for _ in range(length)) + + safe_name_test_cases = [ + ("This is a test string", "This-is-a-test-string"), + ("This is a test string with special characters!@#$%^&*()", + "This-is-a-test-string-with-special-characters"), + ("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"), + ("This is a test string with mixed case ThisIsATestString", + "This-is-a-test-string-with-mixed-case-ThisIsATestString"), + ("This is a test string with extra spaces \u00A0 \u00A0 \u00A0 ThisIsATestString", + "This-is-a-test-string-with-extra-spaces-ThisIsATestString"), + ("This is a test string with unicode 
characters äöüß", + "This-is-a-test-string-with-unicode-characters-äöüß"), + ("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"), # Chinese characters, should stay as is + ( + f"This is a test string with long length {generate_random_string(1000)}", + f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}", + ), + ("", ""), # Empty string + (" ", ""), # String with only space + ("---", "---"), # String with only hyphens + ("___", "___"), # String with only underscores + ("a" * 100, "a" * 100), # Long string with single character + ("a-b-c-d-e", "a-b-c-d-e"), # String with hyphen + ("a_b_c_d_e", "a_b_c_d_e"), # String with underscore + ("a b c d e", "a-b-c-d-e"), # String with spaces + ("test.com/path/to/resource?param1=value1¶m2=value2", + "test.compathtoresourceparam1value1param2value2"), # Test with URL + ("filename.txt", "filename.txt"), # Test with filename + ("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.", + "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."), # Greek characters + ("This is a test with комбинированные знаки ̆ example", + "This-is-a-test-with-комбинированные-знаки-example") # Mixed with unicode + ] + + @pytest.mark.parametrize("input_text, expected_output", safe_name_test_cases) + def test_safe_name(self, input_text, expected_output): + result = safe_name(input_text) + assert result == expected_output From e408c3141511eb878958da9e20dd6c4da7cc9a8f Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 17 May 2025 19:26:18 +0800 Subject: [PATCH 059/108] Fix: it is impossible to have 0.1 byte as byte is the smallest unit --- Whatsapp_Chat_Exporter/utility.py | 4 ++-- tests/test_utility.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 5d52d2d..268247a 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -85,8 +85,8 @@ def 
bytes_to_readable(size_bytes: int) -> str: Returns: A human-readable string representing the file size. """ - if size_bytes == 0: - return "0B" + if size_bytes < 1024: + return f"{size_bytes} B" size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) diff --git a/tests/test_utility.py b/tests/test_utility.py index 1e9012d..6a7c326 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -23,8 +23,8 @@ def test_convert_time_unit(): class TestBytesToReadable: - assert bytes_to_readable(0) == "0B" - assert bytes_to_readable(500) == "500.0 B" + assert bytes_to_readable(0) == "0 B" + assert bytes_to_readable(500) == "500 B" assert bytes_to_readable(1024) == "1.0 KB" assert bytes_to_readable(2048) == "2.0 KB" assert bytes_to_readable(1536) == "1.5 KB" @@ -40,6 +40,7 @@ class TestBytesToReadable: class TestReadableToBytes: def test_conversion(self): + assert readable_to_bytes("0B") == 0 assert readable_to_bytes("100B") == 100 assert readable_to_bytes("50 B") == 50 assert readable_to_bytes("1KB") == 1024 From 24653b875313ed0a8eaa434c31f6eefde8135c59 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 1 Jun 2025 11:53:45 +0800 Subject: [PATCH 060/108] Fixed integer input for --size not being casted to int #156 --- Whatsapp_Chat_Exporter/__main__.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index ff44531..1cdb5c4 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -332,13 +332,18 @@ def validate_args(parser: ArgumentParser, args) -> None: parser.error( "When --enrich-from-vcards is provided, you must also set --default-country-code") - # Size validation - if args.size is not None and not isinstance(args.size, int) and not args.size.isnumeric(): - try: - args.size = 
readable_to_bytes(args.size) - except ValueError: - parser.error( - "The value for --split must be ended in pure bytes or with a proper unit (e.g., 1048576 or 1MB)") + # Size validation and conversion + if args.size is not None: + stripped_size = args.size.strip() + if stripped_size.isnumeric(): + args.size = int(stripped_size) + else: + try: + args.size = readable_to_bytes(stripped_size) + except ValueError: + parser.error( + "The value for --split must be pure bytes or use a proper unit (e.g., 1048576 or 1MB)" + ) # Date filter validation and processing if args.filter_date is not None: From 706466f63b0595e46db73e192310662e63e96c24 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 1 Jun 2025 11:54:24 +0800 Subject: [PATCH 061/108] Enforce a tighter check on the input of size_str --- Whatsapp_Chat_Exporter/utility.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 268247a..c32e536 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -118,6 +118,9 @@ def readable_to_bytes(size_str: str) -> int: 'YB': 1024**8 } size_str = size_str.upper().strip() + match = re.fullmatch(r'^(\d+(\.\d+)?)\s*([KMGTPEZY]?B)?$', size_str) + if not match: + raise ValueError("Invalid size format. 
Expected format like '10MB', '1024GB', or '512'.") unit = ''.join(filter(str.isalpha, size_str)).strip() number = ''.join(c for c in size_str if c.isdigit() or c == '.').strip() if unit not in SIZE_UNITS: From 0ecfe6c59a2b8b175a5072eb71b6d4157667f7c6 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 1 Jun 2025 12:15:15 +0800 Subject: [PATCH 062/108] Cast numeric string in readable_to_bytes --- Whatsapp_Chat_Exporter/__main__.py | 16 ++++++---------- Whatsapp_Chat_Exporter/utility.py | 3 +++ tests/test_utility.py | 3 +-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 1cdb5c4..f561bcb 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -334,16 +334,12 @@ def validate_args(parser: ArgumentParser, args) -> None: # Size validation and conversion if args.size is not None: - stripped_size = args.size.strip() - if stripped_size.isnumeric(): - args.size = int(stripped_size) - else: - try: - args.size = readable_to_bytes(stripped_size) - except ValueError: - parser.error( - "The value for --split must be pure bytes or use a proper unit (e.g., 1048576 or 1MB)" - ) + try: + args.size = readable_to_bytes(args.size) + except ValueError: + parser.error( + "The value for --split must be pure bytes or use a proper unit (e.g., 1048576 or 1MB)" + ) # Date filter validation and processing if args.filter_date is not None: diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index c32e536..fc97d33 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -119,6 +119,9 @@ def readable_to_bytes(size_str: str) -> int: } size_str = size_str.upper().strip() match = re.fullmatch(r'^(\d+(\.\d+)?)\s*([KMGTPEZY]?B)?$', size_str) + if size_str.isnumeric(): + # If the string is purely numeric, assume it's in bytes + return int(size_str) if not match: raise 
ValueError("Invalid size format. Expected format like '10MB', '1024GB', or '512'.") unit = ''.join(filter(str.isalpha, size_str)).strip() diff --git a/tests/test_utility.py b/tests/test_utility.py index 6a7c326..c4d57f2 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -76,8 +76,7 @@ class TestReadableToBytes: readable_to_bytes("ABC KB") def test_missing_unit(self): - with pytest.raises(ValueError, match="Invalid input for size_str"): - readable_to_bytes("100") + assert readable_to_bytes("100") == 100 class TestSanitizeExcept: From f89f53cf2d55c409000ffc79ce7d6219afdcd279 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 1 Jun 2025 12:15:54 +0800 Subject: [PATCH 063/108] Fix test cases --- Whatsapp_Chat_Exporter/utility.py | 4 +--- tests/test_utility.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index fc97d33..04a0c67 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -123,11 +123,9 @@ def readable_to_bytes(size_str: str) -> int: # If the string is purely numeric, assume it's in bytes return int(size_str) if not match: - raise ValueError("Invalid size format. Expected format like '10MB', '1024GB', or '512'.") + raise ValueError("Invalid size format for size_str. Expected format like '10MB', '1024GB', or '512'.") unit = ''.join(filter(str.isalpha, size_str)).strip() number = ''.join(c for c in size_str if c.isdigit() or c == '.').strip() - if unit not in SIZE_UNITS: - raise ValueError("Invalid input for size_str. 
Example: 1024GB") return int(float(number) * SIZE_UNITS[unit]) diff --git a/tests/test_utility.py b/tests/test_utility.py index c4d57f2..8ea2af4 100644 --- a/tests/test_utility.py +++ b/tests/test_utility.py @@ -66,13 +66,13 @@ class TestReadableToBytes: assert readable_to_bytes(" 1 MB") == 1024**2 def test_invalid_unit(self): - with pytest.raises(ValueError, match="Invalid input for size_str"): + with pytest.raises(ValueError, match="Invalid size format for size_str"): readable_to_bytes("100X") readable_to_bytes("A100") readable_to_bytes("100$$$$$") def test_invalid_number(self): - with pytest.raises(ValueError, match="Invalid input for size_str"): + with pytest.raises(ValueError, match="Invalid size format for size_str"): readable_to_bytes("ABC KB") def test_missing_unit(self): From 99213503c4b3657c2b1f4a2558d754fe5aba5415 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 1 Jun 2025 12:17:21 +0800 Subject: [PATCH 064/108] Fix on incorrect rejection by the regex of the size_str String like '1. MB' should be accepted --- Whatsapp_Chat_Exporter/utility.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 04a0c67..a39af16 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -118,10 +118,10 @@ def readable_to_bytes(size_str: str) -> int: 'YB': 1024**8 } size_str = size_str.upper().strip() - match = re.fullmatch(r'^(\d+(\.\d+)?)\s*([KMGTPEZY]?B)?$', size_str) if size_str.isnumeric(): # If the string is purely numeric, assume it's in bytes return int(size_str) + match = re.fullmatch(r'^(\d+(\.\d*)?)\s*([KMGTPEZY]?B)?$', size_str) if not match: raise ValueError("Invalid size format for size_str. 
Expected format like '10MB', '1024GB', or '512'.") unit = ''.join(filter(str.isalpha, size_str)).strip() From 5ed260b0b743778a2eaff35a47d72e2d7365e18c Mon Sep 17 00:00:00 2001 From: glemco <32201227+glemco@users.noreply.github.com> Date: Mon, 16 Jun 2025 11:45:58 +0200 Subject: [PATCH 065/108] Add support for telegram JSON file format Add the --telegram command line argument that, combined with a JSON output, generates a Telegram compatible JSON file [1]. The JSON is per-chat, so the --telegram argument implies the --json-per-chat setting. I took a few shortcuts: * Contact and Ids are inferred from the chat id or phone numbers * All text is marked as plain (e.g. no markup or different types) * Only personal chats and private groups supported * Private groups are defined if the chat has a name * Various ids try to match the ones in WA but may require bulk edits [1] - https://core.telegram.org/import-export Fixes: https://github.com/KnugiHK/WhatsApp-Chat-Exporter/issues/152 --- Whatsapp_Chat_Exporter/__main__.py | 13 +++++- Whatsapp_Chat_Exporter/utility.py | 66 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index f561bcb..05a84f9 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -15,6 +15,7 @@ from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update from Whatsapp_Chat_Exporter.utility import readable_to_bytes, safe_name, bytes_to_readable from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, DbType +from Whatsapp_Chat_Exporter.utility import telegram_json_format from argparse import ArgumentParser, SUPPRESS from datetime import datetime from getpass import getpass @@ -148,6 +149,10 @@ def setup_argument_parser() -> ArgumentParser: '--pretty-print-json', dest='pretty_print_json', 
default=None, nargs='?', const=2, type=int, help="Pretty print the output JSON." ) + json_group.add_argument( + "--telegram", dest="telegram", default=False, action='store_true', + help="Output the JSON in a format compatible with Telegram export (implies json-per-chat)" + ) json_group.add_argument( "--per-chat", dest="json_per_chat", default=False, action='store_true', help="Output the JSON file per chat" @@ -648,7 +653,7 @@ def export_json(args, data: ChatCollection, contact_store=None) -> None: data = {jik: chat.to_json() for jik, chat in data.items()} # Export as a single file or per chat - if not args.json_per_chat: + if not args.json_per_chat and not args.telegram: export_single_json(args, data) else: export_multiple_json(args, data) @@ -684,9 +689,13 @@ def export_multiple_json(args, data: Dict) -> None: else: contact = jik.replace('+', '') + if args.telegram: + obj = telegram_json_format(jik, data[jik]) + else: + obj = {jik: data[jik]} with open(f"{json_path}/{safe_name(contact)}.json", "w") as f: file_content = json.dumps( - {jik: data[jik]}, + obj, ensure_ascii=not args.avoid_encoding_json, indent=args.pretty_print_json ) diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index a39af16..f53b88a 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -627,6 +627,72 @@ def safe_name(text: Union[str, bytes]) -> str: return "-".join(''.join(safe_chars).split()) +def get_from_string(msg: Dict, chat_id: str) -> str: + """Return the number or name for the sender""" + if msg["from_me"]: + return "Me" + if msg["sender"]: + return str(msg["sender"]) + return str(chat_id) + + +def get_from_id(msg: Dict, chat_id: str) -> str: + """Return the user id for the sender""" + if msg["from_me"]: + return "user00000" + if msg["sender"]: + return "user" + msg["sender"] + return f"user{chat_id}" + + +def get_reply_id(data: Dict, reply_key: str) -> Optional[str]: + """Get the id of the message corresponding to the 
reply""" + if not reply_key: + return None + for msg_id, msg in data["messages"].items(): + if msg["key_id"] == reply_key: + return int(msg_id) + return None + + +def telegram_json_format(jik: str, data: Dict) -> Dict: + """Convert the data to the Telegram export format""" + try: + chat_id = int(''.join([c for c in jik if c.isdigit()])) + except ValueError: + # not a real chat: e.g. statusbroadcast + chat_id = 0 + obj = { + "name": data["name"] if data["name"] else jik, + # TODO can we do better than this? + "type": "private_group" if data["name"] else "personal_chat", + "id": chat_id, + "messages": [ { + "id": int(msgId), + "type": "message", + "date": datetime.fromtimestamp(msg["timestamp"]).isoformat().split(".")[0], + "date_unixtime": int(msg["timestamp"]), + "from": get_from_string(msg, chat_id), + "from_id": get_from_id(msg, chat_id), + "reply_to_message_id": get_reply_id(data, msg["reply"]), + "text": msg["data"], + "text_entities": [ + { + # TODO this will lose formatting and different types + "type": "plain", + "text": msg["data"], + } + ], + } for msgId, msg in data["messages"].items()] + } + # remove empty messages and replies + for msg_id, msg in enumerate(obj["messages"]): + if not msg["reply_to_message_id"]: + del obj["messages"][msg_id]["reply_to_message_id"] + obj["messages"] = [m for m in obj["messages"] if m["text"]] + return obj + + class WhatsAppIdentifier(StrEnum): # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ChatStorage.sqlite MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" From 4742ffd8582b5c60fd1a1a918370d13aca278c54 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Thu, 19 Jun 2025 00:10:31 +0800 Subject: [PATCH 066/108] Handle a permission error on macOS #158 Although this does not fix the issue, when the error occurs, it will provide more information to users --- Whatsapp_Chat_Exporter/ios_media_handler.py | 28 ++++++++++++++------- 1 file changed, 19 insertions(+), 9 deletions(-) diff 
--git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index a010c92..44575e0 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -5,7 +5,7 @@ import shutil import sqlite3 import os import getpass -from sys import exit +from sys import exit, platform as osname from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier from Whatsapp_Chat_Exporter.bplist import BPListReader try: @@ -46,15 +46,25 @@ class BackupExtractor: Returns: bool: True if encrypted, False otherwise. """ - with sqlite3.connect(os.path.join(self.base_dir, "Manifest.db")) as db: - c = db.cursor() - try: - c.execute("SELECT count() FROM Files") - c.fetchone() # Execute and fetch to trigger potential errors - except (sqlite3.OperationalError, sqlite3.DatabaseError): - return True + try: + with sqlite3.connect(os.path.join(self.base_dir, "Manifest.db")) as db: + c = db.cursor() + try: + c.execute("SELECT count() FROM Files") + c.fetchone() # Execute and fetch to trigger potential errors + except (sqlite3.OperationalError, sqlite3.DatabaseError): + return True + else: + return False + except sqlite3.DatabaseError as e: + if e == "authorization denied" and osname == "darwin": + logger.error( + "You don't have permission to access the backup database. Please" + "check your permissions or try moving the backup to somewhere else." 
+ ) + exit(8) else: - return False + raise e def _extract_encrypted_backup(self): """ From 716d4af3f30dc28d5c476ecdf939ad65015cfd2a Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Thu, 19 Jun 2025 21:09:00 +0800 Subject: [PATCH 067/108] Fix incorrect type on comparison of exception --- Whatsapp_Chat_Exporter/ios_media_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index 44575e0..4416727 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -57,7 +57,7 @@ class BackupExtractor: else: return False except sqlite3.DatabaseError as e: - if e == "authorization denied" and osname == "darwin": + if str(e) == "authorization denied" and osname == "darwin": logger.error( "You don't have permission to access the backup database. Please" "check your permissions or try moving the backup to somewhere else." 
From 5aa12482e03b701e30f5dca4ab1230b9f9c9022f Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Thu, 19 Jun 2025 21:22:20 +0800 Subject: [PATCH 068/108] Fix on disappearing reply feature in iOS #154 --- Whatsapp_Chat_Exporter/ios_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 3c40202..27aae7c 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -244,7 +244,7 @@ def process_message_data(message, content, is_group_message, data, cursor2): return process_metadata_message(message, content, is_group_message) # Handle quoted replies - if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14") and False: + if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14"): quoted = content["ZMETADATA"][2:19] message.reply = quoted.decode() cursor2.execute(f"""SELECT ZTEXT From 9d3e65bd922f998b251c143ad662e61d80d8fe72 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Thu, 19 Jun 2025 21:41:03 +0800 Subject: [PATCH 069/108] Fix error when using not supplying a value (default) to --size --- Whatsapp_Chat_Exporter/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index f561bcb..54a90a0 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -134,7 +134,7 @@ def setup_argument_parser() -> ArgumentParser: help="Do not output html files" ) output_group.add_argument( - "--size", "--output-size", "--split", dest="size", nargs='?', const=0, default=None, + "--size", "--output-size", "--split", dest="size", nargs='?', const="0", default=None, help="Maximum (rough) size of a single output file in bytes, 0 for auto" ) From 265afc1312153b471760bdc7490e7edb6935081d Mon Sep 17 
00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Thu, 19 Jun 2025 22:00:26 +0800 Subject: [PATCH 070/108] Implement (blue) ticks for message status #146 --- Whatsapp_Chat_Exporter/whatsapp_new.html | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Whatsapp_Chat_Exporter/whatsapp_new.html b/Whatsapp_Chat_Exporter/whatsapp_new.html index 2aa2b7e..dd477f1 100644 --- a/Whatsapp_Chat_Exporter/whatsapp_new.html +++ b/Whatsapp_Chat_Exporter/whatsapp_new.html @@ -127,6 +127,15 @@ --tw-translate-x: -50%; transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y)); } + .status-indicator { + display: inline-block; + margin-left: 4px; + font-size: 0.8em; + color: #8c8c8c; + } + .status-indicator.read { + color: #34B7F1; + } - + - - + + .search-icon { + width: 20px; + height: 20px; + position: relative; + display: inline-block; + } + + .search-icon::before { + content: ''; + position: absolute; + width: 12px; + height: 12px; + border: 2px solid #aebac1; + border-radius: 50%; + top: 2px; + left: 2px; + } + + .search-icon::after { + content: ''; + position: absolute; + width: 2px; + height: 6px; + background: #aebac1; + transform: rotate(45deg); + top: 12px; + left: 12px; + } + + .arrow-left { + width: 0; + height: 0; + border-top: 6px solid transparent; + border-bottom: 6px solid transparent; + border-right: 8px solid #aebac1; + display: inline-block; + } + + .arrow-right { + width: 0; + height: 0; + border-top: 6px solid transparent; + border-bottom: 6px solid transparent; + border-left: 8px solid #aebac1; + display: inline-block; + } + + .info-icon { + width: 20px; + height: 20px; + border: 2px solid currentColor; + border-radius: 50%; + position: relative; + display: inline-block; + } + + .info-icon::before { + content: 'i'; + position: absolute; + top: 50%; + left: 50%; + 
transform: translate(-50%, -50%); + font-size: 12px; + font-weight: bold; + font-style: normal; + } + + - - -
-
-
-
- {% if not no_avatar %} -
- {% if their_avatar is not none %} - - {% else %} - - {% endif %} -
- {% endif %} -
-

{{ headline }}

- {% if status is not none %}

{{ status }}

{% endif %} -
-
-
- - - {% if previous %} - - - - - - {% endif %} - {% if next %} - - - - - + + + +
+
+
+
+ {% if not no_avatar %} +
+ {% if their_avatar is not none %} + + {% else %} + {% endif %}
- -
- - + {% endif %} +
+

{{ headline }}

+ {% if status is not none %}

{{ status }}

{% endif %}
+
+ + + {% if previous %} + + + + {% endif %} + {% if next %} + + + + {% endif %} +
+ +
+ + +
-
-
- - {% set last = {'last': 946688461.001} %} - {% for msg in msgs -%} - {% if determine_day(last.last, msg.timestamp) is not none %} -
-
- {{ determine_day(last.last, msg.timestamp) }} -
-
- {% if last.update({'last': msg.timestamp}) %}{% endif %} - {% endif %} - - {% if msg.from_me == true %} -
-
-
-
- - - - +
+
+
+ + {% set last = {'last': 946688461.001} %} + {% for msg in msgs -%} + {% if determine_day(last.last, msg.timestamp) is not none %} +
+
+ {{ determine_day(last.last, msg.timestamp) }} +
+
+ {% if last.update({'last': msg.timestamp}) %}{% endif %} + {% endif %} + + {% if msg.from_me == true %} +
+
+
+
+ +
-
- {% if msg.reply is not none %} - -
-
-
-

Replying to

-

- {% if msg.quoted_data is not none %} - "{{msg.quoted_data}}" - {% else %} - this message - {% endif %} -

-
- {% set replied_msg = msgs | selectattr('key_id', 'equalto', msg.reply) | first %} - {% if replied_msg and replied_msg.media == true %} -
- {% if "image/" in replied_msg.mime %} - - {% elif "video/" in replied_msg.mime %} -
- -
-
-
-
- {% elif "audio/" in replied_msg.mime %} -
-
-
- {% endif %} -
- {% endif %} -
-
-
- {% endif %} -

- {% if msg.meta == true or msg.media == false and msg.data is none %} -

-
- {% if msg.safe %} - {{ msg.data | safe or 'Not supported WhatsApp internal message' }} - {% else %} - {{ msg.data or 'Not supported WhatsApp internal message' }} - {% endif %} -
-
- {% if msg.caption is not none %} -

{{ msg.caption | urlize(none, true, '_blank') }}

- {% endif %} - {% else %} - {% if msg.media == false %} - {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} - {% else %} - {% if "image/" in msg.mime %} - - - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} - The file cannot be displayed here, however it should be located at here - {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} - {% endif %} - {% if msg.caption is not none %} - {{ msg.caption | urlize(none, true, '_blank') }} - {% endif %} - {% endif %} - {% endif %} -

-

{{ msg.time }} - - {% if msg.received_timestamp %} - ✓✓ - {% else %} - ✓ - {% endif %} - -

-
- {% else %} -
-
- {% if msg.reply is not none %} - -
-
-
-

Replying to

-

- {% if msg.quoted_data is not none %} - {{msg.quoted_data}} - {% else %} - this message - {% endif %} -

-
- {% set replied_msg = msgs | selectattr('key_id', 'equalto', msg.reply) | first %} - {% if replied_msg and replied_msg.media == true %} -
- {% if "image/" in replied_msg.mime %} - - {% elif "video/" in replied_msg.mime %} -
- -
-
-
-
- {% elif "audio/" in replied_msg.mime %} -
-
-
- {% endif %} -
- {% endif %} -
-
-
- {% endif %} -

- {% if msg.meta == true or msg.media == false and msg.data is none %} -

-
- {% if msg.safe %} - {{ msg.data | safe or 'Not supported WhatsApp internal message' }} - {% else %} - {{ msg.data or 'Not supported WhatsApp internal message' }} - {% endif %} -
-
- {% if msg.caption is not none %} -

{{ msg.caption | urlize(none, true, '_blank') }}

- {% endif %} - {% else %} - {% if msg.media == false %} - {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} - {% else %} - {% if "image/" in msg.mime %} - - - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} - The file cannot be displayed here, however it should be located at here +
+ -
-

+ {% endif %} + {% endfor %} +

+ - - - - - - -
-
- - - + - + + + \ No newline at end of file From 177b936b25e268cd538b4eb19492af2dca962ce6 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sun, 27 Jul 2025 16:28:28 +0800 Subject: [PATCH 077/108] Give styling to "End of history" --- Whatsapp_Chat_Exporter/whatsapp_new.html | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Whatsapp_Chat_Exporter/whatsapp_new.html b/Whatsapp_Chat_Exporter/whatsapp_new.html index 538de74..3599065 100644 --- a/Whatsapp_Chat_Exporter/whatsapp_new.html +++ b/Whatsapp_Chat_Exporter/whatsapp_new.html @@ -587,11 +587,13 @@ {% endfor %}