From fb88124d2125f80b6b9913bb6d128e6d38162b89 Mon Sep 17 00:00:00 2001 From: KnugiHK <24708955+KnugiHK@users.noreply.github.com> Date: Sat, 25 Mar 2023 11:51:27 +0800 Subject: [PATCH] Merge old and new schema processing logic --- Whatsapp_Chat_Exporter/__main__.py | 4 +- Whatsapp_Chat_Exporter/extract.py | 341 +++++++++++++++++++---------- 2 files changed, 227 insertions(+), 118 deletions(-) diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index 2f08776..38c74da 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -2,11 +2,11 @@ try: from .__init__ import __version__ except ImportError: from Whatsapp_Chat_Exporter.__init__ import __version__ -from Whatsapp_Chat_Exporter import extract_new as extract +from Whatsapp_Chat_Exporter import extract from Whatsapp_Chat_Exporter import extract_iphone from Whatsapp_Chat_Exporter import extract_iphone_media from Whatsapp_Chat_Exporter.data_model import ChatStore -from Whatsapp_Chat_Exporter.extract_new import Crypt +from Whatsapp_Chat_Exporter.extract import Crypt from argparse import ArgumentParser import os import sqlite3 diff --git a/Whatsapp_Chat_Exporter/extract.py b/Whatsapp_Chat_Exporter/extract.py index d0f8d14..011309e 100644 --- a/Whatsapp_Chat_Exporter/extract.py +++ b/Whatsapp_Chat_Exporter/extract.py @@ -15,6 +15,7 @@ from datetime import datetime from enum import Enum from mimetypes import MimeTypes from hashlib import sha256 +from Whatsapp_Chat_Exporter.data_model import ChatStore, Message try: import zlib @@ -92,6 +93,7 @@ def decrypt_backup(database, key, output, crypt=Crypt.CRYPT14, show_crypt15=Fals t1 = key[30:62] if crypt is not Crypt.CRYPT15 and len(key) != 158: raise ValueError("The key file must be 158 bytes") + # Determine the IV and database offsets if crypt == Crypt.CRYPT14: if len(database) < 191: raise ValueError("The crypt14 file must be at least 191 bytes") @@ -195,150 +197,215 @@ def contacts(db, data): def messages(db, data): # Get message history c = db.cursor() - c.execute("""SELECT count() FROM messages""") + try: + c.execute("""SELECT count() FROM messages""") + except sqlite3.OperationalError: + c.execute("""SELECT count() FROM message""") total_row_number = c.fetchone()[0] print(f"Gathering messages...(0/{total_row_number})", end="\r") phone_number_re = re.compile(r"[0-9]+@s.whatsapp.net") - c.execute("""SELECT messages.key_remote_jid, - messages._id, - messages.key_from_me, - messages.timestamp, - messages.data, - messages.status, - messages.edit_version, - messages.thumb_image, - messages.remote_resource, - messages.media_wa_type, - messages.latitude, - messages.longitude, - messages_quotes.key_id as quoted, - messages.key_id, - messages_quotes.data, - messages.media_caption - FROM messages - LEFT JOIN messages_quotes - ON messages.quoted_row_id = messages_quotes._id - WHERE messages.key_remote_jid <> '-1';""") + try: + c.execute("""SELECT messages.key_remote_jid, + messages._id, + messages.key_from_me, + messages.timestamp, + messages.data, + messages.status, + messages.edit_version, + messages.thumb_image, + messages.remote_resource, + messages.media_wa_type, + messages.latitude, + messages.longitude, + messages_quotes.key_id as quoted, + messages.key_id, + messages_quotes.data as quoted_data, + messages.media_caption + FROM messages + LEFT JOIN messages_quotes + ON messages.quoted_row_id = messages_quotes._id + WHERE messages.key_remote_jid <> '-1';""" + ) + except sqlite3.OperationalError: + try: + c.execute("""SELECT jid_global.raw_string as key_remote_jid, + message._id, + message.from_me as key_from_me, + message.timestamp, + message.text_data as data, + message.status, + message_future.version as edit_version, + message_thumbnail.thumbnail as thumb_image, + message_media.file_path as remote_resource, + message_media.mime_type as media_wa_type, + message_location.latitude, + message_location.longitude, + message_quoted.key_id as quoted, + message.key_id, + message_quoted.text_data as quoted_data, + message.message_type, + jid_group.raw_string as group_sender_jid, + chat.subject as chat_subject + FROM message + LEFT JOIN message_quoted + ON message_quoted.message_row_id = message._id + LEFT JOIN message_location + ON message_location.message_row_id = message._id + LEFT JOIN message_media + ON message_media.message_row_id = message._id + LEFT JOIN message_thumbnail + ON message_thumbnail.message_row_id = message._id + LEFT JOIN message_future + ON message_future.message_row_id = message._id + LEFT JOIN chat + ON chat._id = message.chat_row_id + INNER JOIN jid jid_global + ON jid_global._id = chat.jid_row_id + LEFT JOIN jid jid_group + ON jid_group._id = message.sender_jid_row_id + WHERE key_remote_jid <> '-1';""" + ) + except: + ... + else: + table_message = True + else: + table_message = False i = 0 content = c.fetchone() while content is not None: - if content[0] not in data: - data[content[0]] = {"name": None, "messages": {}} - data[content[0]]["messages"][content[1]] = { - "from_me": bool(content[2]), - "timestamp": content[3]/1000, - "time": datetime.fromtimestamp(content[3]/1000).strftime("%H:%M"), - "media": False, - "key_id": content[13], - "meta": False, - "data": None - } - if "-" in content[0] and content[2] == 0: + if content["key_remote_jid"] not in data: + data[content["key_remote_jid"]] = ChatStore() + if content["key_remote_jid"] is None: + continue # Not sure + data[content["key_remote_jid"]].add_message(content["_id"], Message( + from_me=content["key_from_me"], + timestamp=content["timestamp"], + time=content["timestamp"], + key_id=content["key_id"], + )) + if "-" in content["key_remote_jid"] and content["key_from_me"] == 0: name = None - if content[8] in data: - name = data[content[8]]["name"] - if "@" in content[8]: - fallback = content[8].split('@')[0] + if table_message: + if content["chat_subject"] is not None: + _jid = content["group_sender_jid"] + else: + _jid = content["key_remote_jid"] + if _jid in data: + name = data[_jid].name + fallback = _jid.split('@')[0] if "@" in _jid else None else: fallback = None else: - fallback = None - - data[content[0]]["messages"][content[1]]["sender"] = name or fallback - else: - data[content[0]]["messages"][content[1]]["sender"] = None - - if content[12] is not None: - data[content[0]]["messages"][content[1]]["reply"] = content[12] - data[content[0]]["messages"][content[1]]["quoted_data"] = content[14] - else: - data[content[0]]["messages"][content[1]]["reply"] = None - - if content[15] is not None: - data[content[0]]["messages"][content[1]]["caption"] = content[15] - else: - data[content[0]]["messages"][content[1]]["caption"] = None - - if content[5] == 6: - if "-" in content[0]: - # Is Group - if content[4] is not None: - try: - int(content[4]) - except ValueError: - msg = f"The group name changed to {content[4]}" - data[content[0]]["messages"][content[1]]["data"] = msg - data[content[0]]["messages"][content[1]]["meta"] = True + if content["remote_resource"] in data: + name = data[content["remote_resource"]].name + if "@" in content["remote_resource"]: + fallback = content["remote_resource"].split('@')[0] else: - del data[content[0]]["messages"][content[1]] + fallback = None else: - thumb_image = content[7] + fallback = None + + data[content["key_remote_jid"]].messages[content["_id"]].sender = name or fallback + else: + data[content["key_remote_jid"]].messages[content["_id"]].sender = None + + if content["quoted"] is not None: + data[content["key_remote_jid"]].messages[content["_id"]].reply = content["quoted"] + data[content["key_remote_jid"]].messages[content["_id"]].quoted_data = content["quoted_data"] + else: + data[content["key_remote_jid"]].messages[content["_id"]].reply = None + + if not table_message and content["media_caption"] is not None: + # Old schema + data[content["key_remote_jid"]].messages[content["_id"]].caption = content["media_caption"] + elif table_message and content["message_type"] == 1 and content["data"] is not None: + # New schema + data[content["key_remote_jid"]].messages[content["_id"]].caption = content["data"] + else: + data[content["key_remote_jid"]].messages[content["_id"]].caption = None + + if content["status"] == 6: + if (not table_message and "-" in content["key_remote_jid"]) or \ + (table_message and content["chat_subject"] is not None): + # Is Group + if content["data"] is not None: + try: + int(content["data"]) + except ValueError: + msg = f"The group name changed to {content['data']}" + data[content["key_remote_jid"]].messages[content["_id"]].data = msg + data[content["key_remote_jid"]].messages[content["_id"]].meta = True + else: + data[content["key_remote_jid"]].delete_message(content["_id"]) + else: + thumb_image = content["thumb_image"] if thumb_image is not None: if b"\x00\x00\x01\x74\x00\x1A" in thumb_image: # Add user added = phone_number_re.search( thumb_image.decode("unicode_escape"))[0] if added in data: - name_right = data[added]["name"] + name_right = data[added].name else: name_right = added.split('@')[0] - if content[8] is not None: - if content[8] in data: - name_left = data[content[8]]["name"] + if content["remote_resource"] is not None: + if content["remote_resource"] in data: + name_left = data[content["remote_resource"]].name else: - name_left = content[8].split('@')[0] + name_left = content["remote_resource"].split('@')[0] msg = f"{name_left} added {name_right or 'You'}" else: msg = f"Added {name_right or 'You'}" elif b"\xac\xed\x00\x05\x74\x00" in thumb_image: # Changed number - original = content[8].split('@')[0] + original = content["remote_resource"].split('@')[0] changed = thumb_image[7:].decode().split('@')[0] msg = f"{original} changed to {changed}" - data[content[0]]["messages"][content[1]]["data"] = msg - data[content[0]]["messages"][content[1]]["meta"] = True + data[content["key_remote_jid"]].messages[content["_id"]].data = msg + data[content["key_remote_jid"]].messages[content["_id"]].meta = True else: - if content[4] is None: - del data[content[0]]["messages"][content[1]] + if content["data"] is None: + data[content["key_remote_jid"]].delete_message(content["_id"]) else: # Private chat - if content[4] is None and content[7] is None: - del data[content[0]]["messages"][content[1]] + if content["data"] is None and content["thumb_image"] is None: + data[content["key_remote_jid"]].delete_message(content["_id"]) else: - if content[2] == 1: - if content[5] == 5 and content[6] == 7: + if content["key_from_me"] == 1: + if content["status"] == 5 and content["edit_version"] == 7: msg = "Message deleted" - data[content[0]]["messages"][content[1]]["meta"] = True + data[content["key_remote_jid"]].messages[content["_id"]].meta = True else: - if content[9] == "5": - msg = f"Location shared: {content[10], content[11]}" - data[content[0]]["messages"][content[1]]["meta"] = True + if content["media_wa_type"] == "5": + msg = f"Location shared: {content['latitude'], content['longitude']}" + data[content["key_remote_jid"]].messages[content["_id"]].meta = True else: - msg = content[4] + msg = content["data"] if msg is not None: if "\r\n" in msg: msg = msg.replace("\r\n", "
") if "\n" in msg: msg = msg.replace("\n", "
") else: - if content[5] == 0 and content[6] == 7: + if content["status"] == 0 and content["edit_version"] == 7: msg = "Message deleted" - data[content[0]]["messages"][content[1]]["meta"] = True + data[content["key_remote_jid"]].messages[content["_id"]].meta = True else: - if content[9] == "5": - msg = f"Location shared: {content[10], content[11]}" - data[content[0]]["messages"][content[1]]["meta"] = True + if content["media_wa_type"] == "5": + msg = f"Location shared: {content['latitude'], content['longitude']}" + data[content["key_remote_jid"]].messages[content["_id"]].meta = True else: - msg = content[4] + msg = content["data"] if msg is not None: if "\r\n" in msg: msg = msg.replace("\r\n", "
") if "\n" in msg: msg = msg.replace("\n", "
") - data[content[0]]["messages"][content[1]]["data"] = msg + data[content["key_remote_jid"]].messages[content["_id"]].data = msg i += 1 if i % 1000 == 0: @@ -354,7 +421,8 @@ def media(db, data, media_folder): total_row_number = c.fetchone()[0] print(f"\nGathering media...(0/{total_row_number})", end="\r") i = 0 - c.execute("""SELECT messages.key_remote_jid, + try: + c.execute("""SELECT messages.key_remote_jid, message_row_id, file_path, message_url, @@ -363,22 +431,39 @@ def media(db, data, media_folder): FROM message_media INNER JOIN messages ON message_media.message_row_id = messages._id - ORDER BY messages.key_remote_jid ASC""") + ORDER BY messages.key_remote_jid ASC""" + ) + except sqlite3.OperationalError: + c.execute("""SELECT jid.raw_string as key_remote_jid, + message_row_id, + file_path, + message_url, + mime_type, + media_key + FROM message_media + INNER JOIN message + ON message_media.message_row_id = message._id + LEFT JOIN chat + ON chat._id = message.chat_row_id + INNER JOIN jid + ON jid._id = chat.jid_row_id + ORDER BY jid.raw_string ASC""" + ) content = c.fetchone() mime = MimeTypes() while content is not None: - file_path = f"{media_folder}/{content[2]}" - data[content[0]]["messages"][content[1]]["media"] = True + file_path = f"{media_folder}/{content['file_path']}" + data[content["key_remote_jid"]].messages[content["message_row_id"]].media = True if os.path.isfile(file_path): - data[content[0]]["messages"][content[1]]["data"] = file_path - if content[4] is None: + data[content["key_remote_jid"]].messages[content["message_row_id"]].data = file_path + if content["mime_type"] is None: guess = mime.guess_type(file_path)[0] if guess is not None: - data[content[0]]["messages"][content[1]]["mime"] = guess + data[content["key_remote_jid"]].messages[content["message_row_id"]].mime = guess else: - data[content[0]]["messages"][content[1]]["mime"] = "data/data" + data[content["key_remote_jid"]].messages[content["message_row_id"]].mime = "data/data" else: - data[content[0]]["messages"][content[1]]["mime"] = content[4] + data[content["key_remote_jid"]].messages[content["message_row_id"]].mime = content["mime_type"] else: # if "https://mmg" in content[4]: # try: @@ -390,9 +475,9 @@ def media(db, data, media_folder): # data[content[0]]["messages"][content[1]]["media"] = True # data[content[0]]["messages"][content[1]]["mime"] = "media" # else: - data[content[0]]["messages"][content[1]]["data"] = "The media is missing" - data[content[0]]["messages"][content[1]]["mime"] = "media" - data[content[0]]["messages"][content[1]]["meta"] = True + data[content["key_remote_jid"]].messages[content["message_row_id"]].data = "The media is missing" + data[content["key_remote_jid"]].messages[content["message_row_id"]].mime = "media" + data[content["key_remote_jid"]].messages[content["message_row_id"]].meta = True i += 1 if i % 100 == 0: print(f"Gathering media...({i}/{total_row_number})", end="\r") @@ -403,14 +488,31 @@ def media(db, data, media_folder): def vcard(db, data): c = db.cursor() - c.execute("""SELECT message_row_id, + try: + c.execute("""SELECT message_row_id, messages.key_remote_jid, vcard, messages.media_name FROM messages_vcards INNER JOIN messages ON messages_vcards.message_row_id = messages._id - ORDER BY messages.key_remote_jid ASC;""") + ORDER BY messages.key_remote_jid ASC;""" + ) + except sqlite3.OperationalError: + c.execute("""SELECT message_row_id, + jid.raw_string as key_remote_jid, + vcard, + message.text_data as media_name + FROM message_vcard + INNER JOIN message + ON message_vcard.message_row_id = message._id + LEFT JOIN chat + ON chat._id = message.chat_row_id + INNER JOIN jid + ON jid._id = chat.jid_row_id + ORDER BY message.chat_row_id ASC;""" + ) + rows = c.fetchall() total_row_number = len(rows) print(f"\nGathering vCards...(0/{total_row_number})", end="\r") @@ -418,21 +520,28 @@ def vcard(db, data): if not os.path.isdir(base): Path(base).mkdir(parents=True, exist_ok=True) for index, row in enumerate(rows): - media_name = row[3] if row[3] else "" + media_name = row["media_name"] file_name = "".join(x for x in media_name if x.isalnum()) file_path = f"{base}/{file_name}.vcf" if not os.path.isfile(file_path): with open(file_path, "w", encoding="utf-8") as f: f.write(row[2]) - data[row[1]]["messages"][row[0]]["data"] = media_name + \ + data[row["key_remote_jid"]].messages[row["message_row_id"]].data = media_name + \ "The vCard file cannot be displayed here, " \ f"however it should be located at {file_path}" - data[row[1]]["messages"][row[0]]["mime"] = "text/x-vcard" - data[row[1]]["messages"][row[0]]["meta"] = True + data[row["key_remote_jid"]].messages[row["message_row_id"]].mime = "text/x-vcard" + data[row["key_remote_jid"]].messages[row["message_row_id"]].meta = True print(f"Gathering vCards...({index + 1}/{total_row_number})", end="\r") -def create_html(data, output_folder, template=None, embedded=False, offline_static=False): +def create_html( + data, + output_folder, + template=None, + embedded=False, + offline_static=False, + maximum_size=None + ): if template is None: template_dir = os.path.dirname(__file__) template_file = "whatsapp.html" @@ -464,7 +573,7 @@ def create_html(data, output_folder, template=None, embedded=False, offline_stat w3css = os.path.join(offline_static, "w3.css") for current, contact in enumerate(data): - if len(data[contact]["messages"]) == 0: + if len(data[contact].messages) == 0: continue phone_number = contact.split('@')[0] if "-" in contact: @@ -472,11 +581,11 @@ def create_html(data, output_folder, template=None, embedded=False, offline_stat else: file_name = phone_number - if data[contact]["name"] is not None: + if data[contact].name is not None: if file_name != "": file_name += "-" - file_name += data[contact]["name"].replace("/", "-") - name = data[contact]["name"] + file_name += data[contact].name.replace("/", "-") + name = data[contact].name else: name = phone_number safe_file_name = '' @@ -485,7 +594,7 @@ def create_html(data, output_folder, template=None, embedded=False, offline_stat f.write( template.render( name=name, - msgs=data[contact]["messages"].values(), + msgs=data[contact].messages.values(), my_avatar=None, their_avatar=f"WhatsApp/Avatars/{contact}.j", w3css=w3css