2024-09-15 16:42:30 +00:00
|
|
|
|
import shutil
|
2024-09-13 01:58:51 +00:00
|
|
|
|
from datetime import datetime
|
2024-09-11 07:22:47 +00:00
|
|
|
|
import subprocess
|
2024-02-13 10:22:44 +00:00
|
|
|
|
from bs4 import BeautifulSoup
|
2024-02-13 05:45:57 +00:00
|
|
|
|
import util.RequestUtil as Request
|
2024-02-13 10:22:44 +00:00
|
|
|
|
import util.ToolsUtil as Tools
|
2024-02-13 12:47:18 +00:00
|
|
|
|
import util.ConfigUtil as Config
|
2024-09-14 12:27:15 +00:00
|
|
|
|
import util.GetAllMomentsUtil as GetAllMoments
|
2024-02-13 10:22:44 +00:00
|
|
|
|
import pandas as pd
|
2024-02-13 13:15:37 +00:00
|
|
|
|
import signal
|
2024-09-11 01:46:59 +00:00
|
|
|
|
import os
|
|
|
|
|
import re
|
2024-09-14 12:27:15 +00:00
|
|
|
|
from tqdm import trange, tqdm
|
2024-09-11 01:46:59 +00:00
|
|
|
|
import requests
|
|
|
|
|
import time
|
2024-09-11 07:07:51 +00:00
|
|
|
|
import platform
|
2024-11-13 10:36:46 +00:00
|
|
|
|
import chardet
|
2024-11-16 05:15:37 +00:00
|
|
|
|
import sys
|
|
|
|
|
import io
|
2024-02-13 13:15:37 +00:00
|
|
|
|
|
2024-11-16 05:15:37 +00:00
|
|
|
|
# Force UTF-8 output on stdout (presumably so the Chinese status messages
# print regardless of the console's default encoding).
if hasattr(sys.stdout, "reconfigure"):
    # reconfigure() changes the encoding in place; settings that are not
    # passed (such as line buffering) keep their current values, so
    # interactive prints are still flushed line by line.
    sys.stdout.reconfigure(encoding='utf-8')
elif hasattr(sys.stdout, "buffer"):
    # Fallback for stream objects without reconfigure(): wrap the raw buffer.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
2024-09-15 16:10:59 +00:00
|
|
|
|
# Module-level accumulators shared by the crawl loop and save_data().

# Every collected message row; written out with the columns
# 时间 / 内容 / 图片链接 / 评论.
texts = []

# One [nickname, QQ, homepage link] row per distinct friend seen.
all_friends = []

# Rows whose text does not mention the logged-in user's nickname.
other_message = []

# The user's own posts (rows that are neither guestbook messages nor forwards).
user_message = []

# Rows mentioning the user whose text contains '留言'.
leave_message = []

# Rows mentioning the user whose text contains '转发'.
forward_message = []
|
2024-09-14 12:27:15 +00:00
|
|
|
|
|
2024-09-16 12:13:00 +00:00
|
|
|
|
|
2024-02-13 13:15:37 +00:00
|
|
|
|
# Signal handler
def signal_handler(signal, frame):
    """Save whatever has been collected so far, then terminate the process.

    Intended to be registered with ``signal.signal`` so that a manual
    interruption does not discard already-fetched data.

    Args:
        signal: Number of the received signal (unused). The name shadows the
            ``signal`` module inside this function only; it is kept so the
            signature stays unchanged.
        frame: Interrupted stack frame (unused).
    """
    # Only write output files when there is something to write.
    if texts:
        save_data()
    # sys.exit() rather than the site-provided exit() helper, which is meant
    # for the interactive shell and may be missing (e.g. frozen executables).
    sys.exit(0)
|
|
|
|
|
|
|
|
|
|
|
2024-09-13 02:30:34 +00:00
|
|
|
|
def safe_strptime(date_str):
    """Parse a timestamp such as ``2024年09月13日 10:30:15`` without raising.

    Some timestamps lack the trailing seconds, so the format with seconds is
    tried first and the minute-precision format second.

    Args:
        date_str: Timestamp text to parse.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the value
        matches neither format or is not a string at all (for example an
        empty spreadsheet cell read back as NaN, or ``None``).
    """
    for fmt in ("%Y年%m月%d日 %H:%M:%S", "%Y年%m月%d日 %H:%M"):
        try:
            return datetime.strptime(date_str, fmt)
        except (TypeError, ValueError):
            # ValueError: text does not match this format.
            # TypeError: value is not a string; no format can match it.
            continue
    return datetime.max
|
|
|
|
|
|
2024-09-13 02:30:34 +00:00
|
|
|
|
|
2024-09-13 01:58:51 +00:00
|
|
|
|
def _replace_em_tags(text):
    """Replace every ``[em]...[/em]`` marker in *text* via ``Tools.replace_em_to_img``."""
    return re.sub(r'\[em\](.*?)\[/em\]', Tools.replace_em_to_img, text)


def _build_image_html(img_urls):
    """Return the ``<div class="image">`` block for a comma-separated URL cell."""
    parts = ['<div class="image">']
    for img_url in str(img_urls).split(","):
        if img_url.startswith('http'):
            # Swap the "m" size markers for "s" ones; per the original
            # author's note this selects the high-resolution picture.
            img_url = img_url.replace("/m&ek=1&kp=1", "/s&ek=1&kp=1").replace("!/m/", "!/s/")
            parts.append(f'<img src="{img_url}" alt="图片">\n')
    parts.append("</div>")
    return "".join(parts)


def _build_comments_html(comments, comment_template):
    """Render the comment cell of one post with *comment_template*."""
    # Empty spreadsheet cells are read back as NaN.
    if str(comments) == "nan":
        return ""
    # NOTE(review): eval() runs on text read back from the spreadsheet, which
    # ultimately contains scraped third-party content. Consider storing the
    # comments as JSON or parsing them with ast.literal_eval instead.
    parsed_comments = eval(comments)
    parts = []
    for comment_create_time, comment_content, comment_nickname, comment_uin in parsed_comments:
        parts.append(comment_template.format(
            avatar_url=f"https://q.qlogo.cn/headimg_dl?dst_uin={comment_uin}&spec=640&img_type=jpg",
            nickname=_replace_em_tags(comment_nickname),
            time=comment_create_time,
            message=_replace_em_tags(comment_content),
        ))
    return "".join(parts)


# Rebuild a web-page view of the user's posts
def render_html(shuoshuo_path, zhuanfa_path):
    """Render the post and forward spreadsheets into a single HTML page.

    Both workbooks are read with pandas and must contain the columns
    时间 / 内容 / 图片链接 / 评论. Rows are merged, sorted newest first and
    rendered with the templates returned by ``Tools.get_html_template()``.
    The page is written to ``<cwd>/<Config.result_path><uin>/<uin>_说说网页版.html``.

    A row that fails to render is reported on stdout and skipped, so one bad
    row does not prevent the page from being produced.

    Args:
        shuoshuo_path: Path of the workbook holding the user's own posts.
        zhuanfa_path: Path of the workbook holding the forwarded posts.
    """
    columns = ['时间', '内容', '图片链接', '评论']
    all_data = (pd.read_excel(shuoshuo_path)[columns].values.tolist()
                + pd.read_excel(zhuanfa_path)[columns].values.tolist())
    # Newest first; unparseable timestamps map to datetime.max and sort to the top.
    all_data.sort(key=lambda row: safe_strptime(row[0]), reverse=True)

    # Avatar of the logged-in user, shown on every post.
    avatar_url = f"https://q.qlogo.cn/headimg_dl?dst_uin={Request.uin}&spec=640&img_type=jpg"
    html_template, post_template, comment_template = Tools.get_html_template()

    post_blocks = []
    for post_time, content, img_urls, comments in all_data:
        try:
            # Split on the FIRST colon only: the text before it is treated as
            # the nickname, and everything after it (further colons
            # included, e.g. URLs or clock times) is the message.
            nickname, separator, message = content.partition(":")
            if not separator:
                continue
            post_blocks.append(post_template.format(
                avatar_url=avatar_url,
                nickname=_replace_em_tags(nickname),
                time=post_time,
                message=_replace_em_tags(message),
                image=_build_image_html(img_urls),
                comments=_build_comments_html(comments, comment_template),
            ))
        except Exception as err:
            # Best effort: report the problem and carry on with the next row.
            print(err)

    final_html = html_template.format(posts="".join(post_blocks))

    user_save_path = Config.result_path + Request.uin + '/'
    output_file = os.path.join(os.getcwd(), user_save_path, Request.uin + "_说说网页版.html")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_html)
|
2024-09-13 01:58:51 +00:00
|
|
|
|
|
2024-09-11 01:46:59 +00:00
|
|
|
|
|
2024-09-14 12:27:15 +00:00
|
|
|
|
# Longest picture file name (without extension) derived from a post's text.
_PIC_STEM_MAX_LEN = 40
# Seconds to wait for a picture download before giving up on it.
_PIC_DOWNLOAD_TIMEOUT = 30


def _build_pic_stem(item_text):
    """Turn a post's text into a file name stem without illegal characters."""
    # [em]...[/em] markers, punctuation and path-hostile characters become
    # underscores; spaces are dropped. Truncating before the extension is
    # appended keeps the result ending in a clean ".jpg".
    stem = re.sub(r'\[em\].*?\[/em\]|[^\w\s]|[\\/:*?"<>|\r\n]+', '_', item_text).replace(" ", "")
    return stem[:_PIC_STEM_MAX_LEN]


def _unique_pic_name(pic_save_path, stem):
    """Return a ``.jpg`` name based on *stem* that does not exist in *pic_save_path* yet."""
    pic_name = stem + '.jpg'
    if not os.path.exists(pic_save_path + pic_name):
        return pic_name
    stamped = f"{stem}_{int(time.time())}"
    pic_name = stamped + '.jpg'
    serial = 1
    # Several pictures of one post can be saved within the same second, so
    # the timestamp alone is not unique; add a counter until the name is free.
    while os.path.exists(pic_save_path + pic_name):
        pic_name = f"{stamped}_{serial}.jpg"
        serial += 1
    return pic_name


def _download_pictures(item, pic_save_path):
    """Download every http picture link of one message row into *pic_save_path*."""
    # A row may carry several comma-separated picture links.
    links = [link for link in str(item[2]).split(",") if 'http' in link]
    if not links:
        return
    stem = _build_pic_stem(item[1])
    for link in links:
        try:
            response = requests.get(link, timeout=_PIC_DOWNLOAD_TIMEOUT)
        except requests.RequestException as err:
            # A single unreachable picture must not abort the whole export.
            print(f"图片下载失败,已跳过: {link} ({err})")
            continue
        if response.status_code != 200:
            continue
        with open(pic_save_path + _unique_pic_name(pic_save_path, stem), 'wb') as f:
            f.write(response.content)


# Save the collected data
def save_data():
    """Write all collected data to disk and show a summary.

    Steps, in order:
      1. Create ``<Config.result_path><uin>/`` and its ``pic/`` sub-directory.
      2. Write the complete message list and the friend list to Excel.
      3. Download the pictures of every row and sort the row into the
         user / forward / guestbook / other lists.
      4. Write those four lists to Excel and render the HTML page.
      5. Print statistics, open the result directory and wait for a key press.

    Reads the module-level ``texts``, ``all_friends`` and ``user_nickname``
    and appends to ``user_message``, ``forward_message``, ``leave_message``
    and ``other_message``.
    """
    user_save_path = Config.result_path + Request.uin + '/'
    pic_save_path = user_save_path + 'pic/'
    for directory in (user_save_path, pic_save_path):
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Created directory: {directory}")

    four_columns = ['时间', '内容', '图片链接', '评论']
    three_columns = ['时间', '内容', '图片链接']
    file_prefix = user_save_path + Request.uin

    pd.DataFrame(texts, columns=four_columns).to_excel(
        file_prefix + '_全部列表.xlsx', index=False)
    pd.DataFrame(all_friends, columns=['昵称', 'QQ', '空间主页']).to_excel(
        file_prefix + '_好友列表.xlsx', index=False)

    for item in tqdm(texts, desc="处理消息列表", unit="item"):
        item_text = item[1]
        _download_pictures(item, pic_save_path)

        # Classify the row. Guestbook and "other" rows are stored without
        # their last (comment) column.
        if user_nickname not in item_text:
            other_message.append(item[:-1])
        elif '留言' in item_text:
            leave_message.append(item[:-1])
        elif '转发' in item_text:
            forward_message.append(item)
        else:
            user_message.append(item)

    pd.DataFrame(user_message, columns=four_columns).to_excel(
        file_prefix + '_说说列表.xlsx', index=False)
    pd.DataFrame(forward_message, columns=four_columns).to_excel(
        file_prefix + '_转发列表.xlsx', index=False)
    pd.DataFrame(leave_message, columns=three_columns).to_excel(
        file_prefix + '_留言列表.xlsx', index=False)
    pd.DataFrame(other_message, columns=three_columns).to_excel(
        file_prefix + '_其他列表.xlsx', index=False)

    render_html(file_prefix + '_说说列表.xlsx', file_prefix + '_转发列表.xlsx')
    Tools.show_author_info()

    print('\033[36m' + '导出成功,请查看 ' + user_save_path + Request.uin + ' 文件夹内容' + '\033[0m')
    print('\033[32m' + '共有 ' + str(len(texts)) + ' 条消息' + '\033[0m')
    # str() guards against a timestamp that is not a string.
    print('\033[36m' + '最早的一条说说发布在' + str(texts[-1][0]) + '\033[0m')
    print('\033[32m' + '好友列表共有 ' + str(len(all_friends)) + ' 个好友' + '\033[0m')
    print('\033[36m' + '说说列表共有 ' + str(len(user_message)) + ' 条说说' + '\033[0m')
    print('\033[32m' + '转发列表共有 ' + str(len(forward_message)) + ' 条转发' + '\033[0m')
    print('\033[36m' + '留言列表共有 ' + str(len(leave_message)) + ' 条留言' + '\033[0m')
    print('\033[32m' + '其他列表共有 ' + str(len(other_message)) + ' 条内容' + '\033[0m')
    print('\033[36m' + '图片列表共有 ' + str(len(os.listdir(pic_save_path))) + ' 张图片' + '\033[0m')

    # abspath() resolves the result directory against the working directory
    # for any form of Config.result_path, not only "./"-prefixed ones.
    open_file(os.path.abspath(user_save_path))

    # Keep the console window open until a key is pressed.
    if platform.system() == 'Windows':
        os.system('pause')
    else:
        os.system('stty raw -echo;dd bs=1 count=1 >/dev/null 2>&1;stty cooked echo')
|
2024-09-14 12:27:15 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Open a file or directory for the user
def open_file(file_path):
    """Open *file_path* with the platform's default handler.

    Windows uses ``os.startfile``, macOS the ``open`` command, and Linux the
    first of ``xdg-open``, ``gnome-open`` or ``kde-open`` found on PATH.
    When nothing suitable is available a message is printed instead.

    Args:
        file_path: Path to hand to the platform's opener.
    """
    system_name = platform.system()

    if system_name == 'Windows':
        os.startfile(file_path)
        return

    if system_name == 'Darwin':
        subprocess.run(['open', file_path])
        return

    if system_name != 'Linux':
        print(f"Unsupported OS: {platform.system()}")
        return

    # Linux: try the generic opener first, then the desktop-specific ones.
    for opener in ('xdg-open', 'gnome-open', 'kde-open'):
        if shutil.which(opener):
            subprocess.run([opener, file_path])
            return

    # No known opener is installed; ask the user to open the path manually.
    print("未找到可用的打开命令,请手动打开文件。")
|
|
|
|
|
|
2024-02-13 12:47:18 +00:00
|
|
|
|
|
2024-02-12 09:29:14 +00:00
|
|
|
|
if __name__ == '__main__':
    # Log in first; nothing else can work without a session.
    try:
        user_info = Request.get_login_user_info()
        user_nickname = user_info[Request.uin][6]
        print(f"用户<{Request.uin}>,<{user_nickname}>登录成功")
    except Exception as e:
        print(f"登录失败:请重新登录,错误信息:{str(e)}")
        sys.exit(0)

    count = Request.get_message_count()

    # Sets mirroring the lists so duplicate checks do not rescan them.
    known_friend_qqs = {friend[1] for friend in all_friends}
    known_texts = {row[1] for row in texts}

    try:
        # Register the handlers so an interrupted run still saves its data.
        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

        for i in trange(int(count / 10) + 1, desc='Progress', unit='10条'):
            # Fetch 10 messages per request.
            response = Request.get_message(i * 10, 10)
            if response is None or not hasattr(response, 'content'):
                print(f"获取消息失败:第 {i} 批次,返回值为空或无效")
                continue

            content_bytes = response.content
            detected_encoding = chardet.detect(content_bytes)['encoding']
            # errors="replace": a wrongly detected encoding must not abort
            # the whole crawl because of a few undecodable bytes.
            message = content_bytes.decode(detected_encoding or "utf-8", errors="replace")

            # Process the HTML payload.
            html = Tools.process_old_html(message)
            if "li" not in html:
                continue
            soup = BeautifulSoup(html, 'html.parser')

            for element in soup.find_all('li', class_='f-single f-s-s'):
                # Collect the friend's nickname, QQ number and homepage link.
                friend_element = element.find('a', class_='f-name q_namecard')
                if friend_element is not None:
                    friend_link_attr = friend_element.get('link')
                    if friend_link_attr:
                        # Drops the first 9 characters of the attribute;
                        # presumably a fixed prefix before the QQ number
                        # - TODO confirm against real markup.
                        friend_qq = friend_link_attr[9:]
                        if friend_qq not in known_friend_qqs:
                            known_friend_qqs.add(friend_qq)
                            all_friends.append(
                                [friend_element.get_text(), friend_qq, friend_element.get('href')])

                time_element = element.find('div', class_='info-detail')
                text_element = element.find('p', class_='txt-box-title ellipsis-one')
                if time_element is None or text_element is None:
                    continue

                put_time = time_element.get_text().replace('\xa0', ' ')
                text = text_element.get_text().replace('\xa0', ' ')
                if text in known_texts:
                    continue

                img = None
                img_element = element.find('a', class_='img-item')
                img_tag = img_element.find('img') if img_element is not None else None
                if img_tag is not None:
                    img = img_tag.get('src')

                known_texts.add(text)
                # Append all four columns right away (empty comment column)
                # so save_data() also works when it is triggered by the
                # signal handler in the middle of the crawl.
                texts.append([put_time, text, img, ""])

            # Rest for 3 seconds after every batch of 10.
            print("Pause for 3 seconds...")
            time.sleep(3)

    except Exception as e:
        print(f"获取QQ空间互动消息发生异常: {str(e)}")

    try:
        user_moments = GetAllMoments.get_visible_moments_list()
        if user_moments:
            # Drop recovered rows that Tools.is_any_mutual_exist matches
            # against a visible post, then add the visible posts themselves.
            texts = [t for t in texts if
                     not any(Tools.is_any_mutual_exist(t[1], u[1]) for u in user_moments)]
            texts.extend(user_moments)
    except Exception as err:
        print(f"获取未删除QQ空间记录发生异常: {str(err)}")

    if texts:
        save_data()
|