diff --git a/.github/workflows/compile-binary.yml b/.github/workflows/compile-binary.yml index 4eca4a0..be1b723 100644 --- a/.github/workflows/compile-binary.yml +++ b/.github/workflows/compile-binary.yml @@ -16,11 +16,11 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.12' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka + pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka==2.3 pip install . - name: Build binary with Nuitka run: | @@ -40,11 +40,11 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.12' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka + pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka==2.3 pip install . - name: Build binary with Nuitka run: | @@ -64,11 +64,11 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.10' + python-version: '3.12' - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka + pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka==2.3 pip install . - name: Build binary with Nuitka run: | diff --git a/.gitignore b/.gitignore index dd82d78..5831f34 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,14 @@ dmypy.json *.onefile-build/ *.exe __main__ + + +# Dev time intermidiates & temp files +result/ +WhatsApp/ +/*.db +/*.db-* +/myout +/msgstore.db +/myout-json +.vscode/ \ No newline at end of file diff --git a/README.md b/README.md index a5c6b10..2fd2b2f 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,13 @@ Simply invoke the following command from shell. 
```sh wtsexporter -a ``` +#### Enriching Contacts from vCard +Usually, the default WhatsApp contact database extracted from your phone contains the contact names, and the exporter uses it to map your chats. However, some reported cases showed that the database may never have been populated. +In this case, you can export your contacts to a vCard file from your phone or a cloud provider like Google Contacts. Then, install the necessary dependency and run the following command from the shell: +```sh +pip install whatsapp-chat-exporter["vcards"] +wtsexporter -a --enrich-from-vcards contacts.vcf --default-country-code 852 +``` ### Encrypted Android WhatsApp Backup In order to support the decryption, install pycryptodome if it is not installed diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index ae953b5..3b3eed8 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -7,10 +7,17 @@ import shutil import json import string import glob +try: + import vobject +except ModuleNotFoundError: + vcards_deps_installed = False +else: + vcards_deps_installed = True from Whatsapp_Chat_Exporter import exported_handler, android_handler from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler +from Whatsapp_Chat_Exporter.vcards_contacts import ContactsFromVCards from Whatsapp_Chat_Exporter.data_model import ChatStore -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, DbType +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, DbType, chat_is_empty from Whatsapp_Chat_Exporter.utility import check_update, import_from_json from argparse import ArgumentParser, SUPPRESS from datetime import datetime @@ -85,6 +92,20 @@ def main(): type=str, const="result.json", help="Save the result to a single JSON file (default if present: result.json)") + parser.add_argument( + '--avoid-encoding-json', + dest='avoid_encoding_json', + default=False, + action='store_true', + help="Don't encode
non-ascii characters in the output JSON files") + parser.add_argument( + '--pretty-print-json', + dest='pretty_print_json', + default=None, + nargs='?', + const=2, + type=int, + help="Pretty print the output JSON.") parser.add_argument( '-d', '--db', @@ -232,6 +253,13 @@ def main(): metavar="phone number", help="Exclude chats that match the supplied phone number" ) + parser.add_argument( + "--dont-filter-empty", + dest="filter_empty", + default=True, + action='store_false', + help="By default, the exporter will not render chats with no valid message. Setting this flag will cause the exporter to render those." + ) parser.add_argument( "--per-chat", dest="json_per_chat", @@ -253,6 +281,19 @@ def main(): type=int, help="Specify the chunk size for decrypting iOS backup, which may affect the decryption speed." ) + parser.add_argument( + "--enrich-from-vcards", + dest="enrich_from_vcards", + default=None, + help="Path to an exported vcf file from Google contacts export. Add names missing from WhatsApp's default database" + ) + parser.add_argument( + "--default-country-code", + dest="default_contry_code", + default=None, + help="Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. 
Most likely use the number of your own country" + ) + args = parser.parse_args() # Check for updates @@ -277,6 +318,8 @@ def main(): (args.json[-5:] == ".json" and os.path.isfile(args.json[:-5])) ): parser.error("When --per-chat is enabled, the destination of --json must be a directory.") + if args.enrich_from_vcards is not None and args.default_contry_code is None: + parser.error("When --enrich-from-vcards is provided, you must also set --default-country-code") if args.filter_date is not None: if " - " in args.filter_date: start, end = args.filter_date.split(" - ") @@ -320,6 +363,17 @@ def main(): data = {} + contact_store = ContactsFromVCards() + + if args.enrich_from_vcards is not None: + if not vcards_deps_installed: + parser.error( + "You don't have the dependency to enrich contacts with vCard.\n" + "Read more on how to deal with enriching contacts:\n" + "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" + ) + contact_store.load_vcf_file(args.enrich_from_vcards, args.default_contry_code) + if args.android: contacts = android_handler.contacts messages = android_handler.messages @@ -429,6 +483,9 @@ def main(): if args.android: android_handler.calls(db, data, args.timezone_offset, filter_chat) if not args.no_html: + if not contact_store.is_empty(): + contact_store.enrich_from_vcards(data) + create_html( data, args.output, @@ -436,7 +493,8 @@ def main(): args.embedded, args.offline, args.size, - args.no_avatar + args.no_avatar, + args.filter_empty ) else: print( @@ -471,7 +529,9 @@ def main(): args.template, args.embedded, args.offline, - args.size + args.size, + args.no_avatar, + args.filter_empty ) for file in glob.glob(r'*.*'): shutil.copy(file, args.output) @@ -483,15 +543,28 @@ def main(): args.template, args.embedded, args.offline, - args.size + args.size, + args.no_avatar, + args.filter_empty ) if args.json and not args.import_json: + if args.filter_empty: + data = {k: v for k, v in data.items() if not chat_is_empty(v)} + + if not 
contact_store.is_empty(): + contact_store.enrich_from_vcards(data) + if isinstance(data[next(iter(data))], ChatStore): data = {jik: chat.to_json() for jik, chat in data.items()} + if not args.json_per_chat: with open(args.json, "w") as f: - data = json.dumps(data) + data = json.dumps( + data, + ensure_ascii=not args.avoid_encoding_json, + indent=args.pretty_print_json + ) print(f"\nWriting JSON file...({int(len(data)/1024/1024)}MB)") f.write(data) else: @@ -506,7 +579,8 @@ def main(): else: contact = jik.replace('+', '') with open(f"{args.json}/{contact}.json", "w") as f: - f.write(json.dumps(data[jik])) + file_content_to_write = json.dumps(data[jik], ensure_ascii=not args.avoid_encoding_json, indent=2 if args.pretty_print_json else None) + f.write(file_content_to_write) print(f"Writing JSON file...({index + 1}/{total})", end="\r") print() else: diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index 2024112..32c2121 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -11,7 +11,7 @@ from markupsafe import escape as htmle from hashlib import sha256 from base64 import b64decode, b64encode from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import MAX_SIZE, ROW_SIZE, DbType, determine_metadata, JidType +from Whatsapp_Chat_Exporter.utility import MAX_SIZE, ROW_SIZE, DbType, determine_metadata, JidType, chat_is_empty from Whatsapp_Chat_Exporter.utility import rendering, Crypt, Device, get_file_name, setup_template from Whatsapp_Chat_Exporter.utility import brute_force_offset, CRYPT14_OFFSETS, get_status_location from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify @@ -157,7 +157,11 @@ def contacts(db, data): c = db.cursor() c.execute("""SELECT count() FROM wa_contacts""") total_row_number = c.fetchone()[0] - print(f"Processing contacts...({total_row_number})") + if total_row_number == 0: + print("No 
"""Fill in missing chat names from contacts exported as a vCard (.vcf) file."""
import itertools
from typing import List, TypedDict

# vobject is an optional dependency. The command-line entry point imports this
# module unconditionally, so a hard import here would crash the whole program
# for users who never asked for vCard enrichment. Defer the failure to the one
# function that really needs the library.
try:
    import vobject
except ModuleNotFoundError:
    vobject = None


class ExportedContactNumbers(TypedDict):
    """One contact read from the vCard file: a display name and its numbers."""

    full_name: str
    numbers: List[str]


class ContactsFromVCards:
    """Hold (normalized number, display name) pairs and apply them to chats."""

    def __init__(self) -> None:
        # List of (number, name) tuples as produced by map_number_to_name().
        self.contact_mapping = []

    def is_empty(self):
        """Return True when no contact has been loaded."""
        return not self.contact_mapping

    def load_vcf_file(self, vcf_file_path: str, default_country_code: str):
        """Replace the current mapping with the contacts found in a vcf file.

        Args:
            vcf_file_path: Path of the vCard file to read.
            default_country_code: Digits prepended to numbers that carry no
                international prefix (see normalize_number()).
        """
        self.contact_mapping = read_vcards_file(vcf_file_path, default_country_code)

    def enrich_from_vcards(self, chats):
        """Give a name to every unnamed chat whose key starts with a known number.

        Args:
            chats: Mapping of string keys to chat objects. Chats whose ``name``
                attribute is missing or ``None`` are updated in place; chats
                that already have a name are left untouched.
        """
        for number, name in self.contact_mapping:
            # short number must be a bad contact, lets skip it
            if len(number) <= 5:
                continue

            for chat in filter_chats_by_prefix(chats, number).values():
                if getattr(chat, 'name', None) is None:
                    chat.name = name


def read_vcards_file(vcf_file_path, default_country_code: str):
    """Parse a vcf file into a list of (normalized number, name) tuples.

    Entries lacking either a formatted name (``fn``) or a telephone (``tel``)
    are skipped.

    Raises:
        ModuleNotFoundError: If the optional ``vobject`` package is missing.
    """
    if vobject is None:
        raise ModuleNotFoundError(
            "vobject is required to read vCard files; "
            "install it with: pip install vobject"
        )

    contacts = []
    with open(vcf_file_path, mode="r", encoding="utf-8") as f:
        for row in vobject.readComponents(f):
            if not hasattr(row, 'fn') or not hasattr(row, 'tel'):
                continue

            contact: ExportedContactNumbers = {
                "full_name": row.fn.value,
                "numbers": [tel.value for tel in row.tel_list],
            }
            contacts.append(contact)

    return map_number_to_name(contacts, default_country_code)


def filter_chats_by_prefix(chats, prefix: str):
    """Return the sub-dict of ``chats`` whose keys start with ``prefix``."""
    return {k: v for k, v in chats.items() if k.startswith(prefix)}


def map_number_to_name(contacts, default_country_code: str):
    """Flatten contacts into a list of (normalized number, name) tuples.

    A contact with several numbers yields one tuple per number, and the name
    is suffixed with the 1-based position, e.g. ``"Ann (2)"``.
    """
    mapping = []
    for contact in contacts:
        numbers = contact['numbers']
        has_many = len(numbers) > 1
        for index, num in enumerate(numbers):
            normalized = normalize_number(num, default_country_code)
            if has_many:
                name = f"{contact['full_name']} ({index+1})"
            else:
                name = contact['full_name']
            mapping.append((normalized, name))
    return mapping


def normalize_number(number: str, country_code: str):
    """Return ``number`` as bare digits that start with a country code.

    Args:
        number: Phone number as written in the vCard; may contain spaces,
            dashes, parentheses and a leading ``+`` or ``00``.
        country_code: Digits to prepend when ``number`` has no international
            prefix.

    Returns:
        The number with the ``+``/``00`` prefix removed when present;
        otherwise ``country_code`` followed by the number without its
        leading zero.
    """
    # Clean the number
    number = ''.join(c for c in number if c.isdigit() or c == "+")

    # A number that starts with a + or 00 means it already have a country code
    for starting_char in ('+', "00"):
        if number.startswith(starting_char):
            return number[len(starting_char):]

    # leading zero should be removed
    # (test the number itself: the loop variable above is always "00" here,
    # so comparing it against '0' could never strip anything)
    if number.startswith('0'):
        number = number[1:]
    return country_code + number  # fall back
index 0000000..194b637 --- /dev/null +++ b/Whatsapp_Chat_Exporter/vcards_contacts_test.py @@ -0,0 +1,20 @@ +# from contacts_names_from_vcards import readVCardsFile + +from Whatsapp_Chat_Exporter.vcards_contacts import normalize_number, read_vcards_file + + +def test_readVCardsFile(): + assert len(read_vcards_file("contacts.vcf", "973")) > 0 + +def test_create_number_to_name_dicts(): + pass + +def test_fuzzy_match_numbers(): + pass + +def test_normalize_number(): + assert normalize_number('0531234567', '1') == '1531234567' + assert normalize_number('001531234567', '2') == '1531234567' + assert normalize_number('+1531234567', '34') == '1531234567' + assert normalize_number('053(123)4567', '34') == '34531234567' + assert normalize_number('0531-234-567', '58') == '58531234567' diff --git a/setup.py b/setup.py index 98429ac..5fa5484 100644 --- a/setup.py +++ b/setup.py @@ -55,9 +55,10 @@ setuptools.setup( 'crypt12': ["pycryptodome"], 'crypt14': ["pycryptodome"], 'crypt15': ["pycryptodome", "javaobj-py3"], - 'all': ["pycryptodome", "javaobj-py3"], - 'everything': ["pycryptodome", "javaobj-py3"], - 'backup': ["pycryptodome", "javaobj-py3"] + 'all': ["pycryptodome", "javaobj-py3", "vobject"], + 'everything': ["pycryptodome", "javaobj-py3", "vobject"], + 'backup': ["pycryptodome", "javaobj-py3"], + 'vcards': ["vobject", "pycryptodome", "javaobj-py3"], }, entry_points={ "console_scripts": [