Merge the undeleted-posts fetching feature into the main program and clean up parts of the code

SwimmingLiu 2024-09-14 20:27:15 +08:00
parent 238dc0d8f3
commit ffc2818f18
5 changed files with 383 additions and 104 deletions

README.md

@ -9,24 +9,29 @@
```text
project/
├── resource/                  # 资源目录
│   ├── config/                # 配置目录,文件保存位置配置
│   │   └── config.ini
│   ├── result/                # 导出结果的目录,格式为“你的qq.xlsx”
│   │   ├── ...
│   │   └── ...
│   ├── temp/                  # 缓存目录
│   │   ├── ...
│   │   └── ...
│   ├── user/                  # 用户信息
│   │   ├── ...
│   │   └── ...
├── util/                      # 单元工具目录
│   ├── ConfigUtil.py          # 读取配置
│   ├── GetAllMomentsUtil.py   # 获取未删除的所有说说
│   ├── LoginUtil.py           # 登录相关
│   ├── RequestUtil.py         # 请求数据相关
│   └── ToolsUtil.py           # 工具
├── main.py                    # 主程序入口
├── fetch_all_message.py       # 主程序入口
├── README.md                  # 项目说明文件
├── requirements.txt           # 依赖项列表
└── LICENSE                    # 许可证文件
```
## 安装

main.py

@ -6,15 +6,17 @@ from tqdm import trange
import util.RequestUtil as Request
import util.ToolsUtil as Tools
import util.ConfigUtil as Config
import util.GetAllMomentsUtil as GetAllMoments
import pandas as pd
import signal
import os
import re
from tqdm import trange,tqdm
from tqdm import trange, tqdm
import requests
import time
import platform
# 信号处理函数
def signal_handler(signal, frame):
# 在手动结束程序时保存已有的数据
@ -24,65 +26,86 @@ def signal_handler(signal, frame):
def safe_strptime(date_str):
# 部分日期缺少最后的秒数,首先解析带秒数的日期格式,如果解析失败再解析不带秒数的日期
try:
# 尝试按照指定格式解析日期
return datetime.strptime(date_str, "%Y年%m月%d日 %H:%M")
# 尝试按照带秒格式解析日期
return datetime.strptime(date_str, "%Y年%m月%d日 %H:%M:%S")
except ValueError:
# 如果日期格式不对,返回 datetime.max
return datetime.max
# 尝试按照不带秒格式解析日期
try:
return datetime.strptime(date_str, "%Y年%m月%d日 %H:%M")
except ValueError:
# 如果日期格式不对,返回 datetime.max
return datetime.max
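As a quick check of the new two-step parsing, here is a minimal, self-contained sketch of the same fallback logic (the function name and the sample timestamps below are made up for illustration):

```python
from datetime import datetime

def parse_moment_time(date_str):
    # Try the with-seconds format first, then fall back to the minutes-only one.
    for fmt in ("%Y年%m月%d日 %H:%M:%S", "%Y年%m月%d日 %H:%M"):
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return datetime.max  # sentinel for strings in neither format

print(parse_moment_time("2024年9月14日 20:27:15"))  # 2024-09-14 20:27:15
print(parse_moment_time("2024年9月14日 20:27"))     # 2024-09-14 20:27:00
```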
# 还原QQ空间网页版说说
def render_html(shuoshuo_path, zhuanfa_path):
# 读取 Excel 文件内容
shuoshuo_df = pd.read_excel(shuoshuo_path)
zhuanfa_df = pd.read_excel(zhuanfa_path)
# 头像
avatar_url = f"https://q.qlogo.cn/headimg_dl?dst_uin={Request.uin}&spec=640&img_type=jpg"
# 提取说说列表中的数据
shuoshuo_data = shuoshuo_df[['时间', '内容', '图片链接']].values.tolist()
# 提取转发列表中的数据
zhuanfa_data = zhuanfa_df[['时间', '内容', '图片链接']].values.tolist()
# 合并所有数据
all_data = shuoshuo_data + zhuanfa_data
# 按时间排序
all_data.sort(key=lambda x: safe_strptime(x[0]) or datetime.min, reverse=True)
html_template, post_template = Tools.get_html_template()
# 构建动态内容
post_html = ""
for entry in all_data:
try:
time, content, img_url = entry
img_url = str(img_url)
content_lst = content.split("：")
if len(content_lst) == 1:
continue
nickname = content_lst[0]
message = content_lst[1]
# 读取 Excel 文件内容
shuoshuo_df = pd.read_excel(shuoshuo_path)
zhuanfa_df = pd.read_excel(zhuanfa_path)
# 头像
avatar_url = f"https://q.qlogo.cn/headimg_dl?dst_uin={Request.uin}&spec=640&img_type=jpg"
# 提取说说列表中的数据
shuoshuo_data = shuoshuo_df[['时间', '内容', '图片链接', '评论']].values.tolist()
# 提取转发列表中的数据
zhuanfa_data = zhuanfa_df[['时间', '内容', '图片链接', '评论']].values.tolist()
# 合并所有数据
all_data = shuoshuo_data + zhuanfa_data
# 按时间排序
all_data.sort(key=lambda x: safe_strptime(x[0]) or datetime.min, reverse=True)
html_template, post_template, comment_template = Tools.get_html_template()
# 构建动态内容
post_html = ""
for entry in all_data:
try:
time, content, img_urls, comments = entry
img_url_lst = str(img_urls).split(",")
content_lst = content.split("：")
if len(content_lst) == 1:
continue
nickname = content_lst[0]
message = content_lst[1]
image_html = '<div class="image">'
for img_url in img_url_lst:
if img_url and img_url.startswith('http'):
image_html += f'<img src="{img_url}" alt="图片">\n'
image_html += "</div>"
comment_html = ""
# 获取评论数据
if str(comments) != "nan":
comments = eval(comments)
for comment in comments:
comment_create_time, comment_content, comment_nickname, comment_uin = comment
comment_avatar_url = f"https://q.qlogo.cn/headimg_dl?dst_uin={comment_uin}&spec=640&img_type=jpg"
comment_html += comment_template.format(
avatar_url=comment_avatar_url,
nickname=comment_nickname,
time=comment_create_time,
message=comment_content
)
# 生成每个动态的HTML块
post_html += post_template.format(
avatar_url=avatar_url,
nickname=nickname,
time=time,
message=message,
image=image_html,
comments=comment_html
)
except Exception as err:
print(err)
image_html = f'<div class="image"><img src="{img_url}" alt="图片"></div>' if img_url and img_url.startswith(
'http') else ''
# 生成每个动态的HTML块
post_html += post_template.format(
avatar_url=avatar_url,
nickname=nickname,
time=time,
message=message,
image=image_html
)
except Exception as err:
print(err)
# 生成完整的HTML
final_html = html_template.format(posts=post_html)
user_save_path = Config.result_path + Request.uin + '/'
# 将HTML写入文件
output_file = os.path.join(os.getcwd(), user_save_path, Request.uin + "_说说网页版.html")
with open(output_file, 'w', encoding='utf-8') as f:
f.write(final_html)
# 生成完整的HTML
final_html = html_template.format(posts=post_html)
user_save_path = Config.result_path + Request.uin + '/'
# 将HTML写入文件
output_file = os.path.join(os.getcwd(), user_save_path, Request.uin + "_说说网页版.html")
with open(output_file, 'w', encoding='utf-8') as f:
f.write(final_html)
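The new 评论 column round-trips through Excel as the text form of a Python list, so the rendering loop above has to turn the cell back into a list (the commit does this with `eval`). A minimal sketch of that parsing step on a made-up cell value, using `ast.literal_eval` as a safer stand-in:

```python
import ast

# Made-up cell value shaped like the 评论 column: a list of [时间, 内容, 昵称, QQ号] rows.
comments_cell = "[['2024年9月14日 20:27:15', '拍得真好', '小明', 10001]]"

if str(comments_cell) != "nan":                 # same "empty cell" check as above
    comments = ast.literal_eval(comments_cell)  # parse the list literal without eval
    for create_time, content, nickname, uin in comments:
        avatar_url = f"https://q.qlogo.cn/headimg_dl?dst_uin={uin}&spec=640&img_type=jpg"
        print(create_time, nickname, content, avatar_url)
```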
# 保存数据
def save_data():
user_save_path = Config.result_path + Request.uin + '/'
pic_save_path = user_save_path + 'pic/'
@ -92,38 +115,50 @@ def save_data():
if not os.path.exists(pic_save_path):
os.makedirs(pic_save_path)
print(f"Created directory: {pic_save_path}")
pd.DataFrame(texts, columns=['时间', '内容', '图片链接']).to_excel(user_save_path + Request.uin + '_全部列表.xlsx', index=False)
pd.DataFrame(all_friends, columns=['昵称', 'QQ', '空间主页']).to_excel(user_save_path + Request.uin + '_好友列表.xlsx', index=False)
pd.DataFrame(texts, columns=['时间', '内容', '图片链接', '评论']).to_excel(
user_save_path + Request.uin + '_全部列表.xlsx',
index=False)
pd.DataFrame(all_friends, columns=['昵称', 'QQ', '空间主页']).to_excel(
user_save_path + Request.uin + '_好友列表.xlsx', index=False)
for item in tqdm(texts, desc="处理消息列表", unit="item"):
item_text = item[1]
item_pic_link = item[2]
if item_pic_link is not None and len(item_pic_link) > 0 and 'http' in item_pic_link:
# 保存图片
pic_name = re.sub(r'[\\/:*?"<>|]', '_', item_text) + '.jpg'
# 可见说说中可能存在多张图片
item_pic_links = str(item[2]).split(",")
for item_pic_link in item_pic_links:
if item_pic_link is not None and len(item_pic_link) > 0 and 'http' in item_pic_link:
# 保存图片
pic_name = re.sub(r'[\\/:*?"<>|]', '_', item_text) + '.jpg'
# 去除文件名中的空格
pic_name = pic_name.replace(' ', '')
# 限制文件名长度
if len(pic_name) > 40:
pic_name = pic_name[:40] + '.jpg'
# pic_name = pic_name.split('：')[1] + '.jpg'
response = requests.get(item_pic_link)
if response.status_code == 200:
with open(pic_save_path + pic_name, 'wb') as f:
f.write(response.content)
pic_name = pic_name.replace(' ', '')
# 限制文件名长度
if len(pic_name) > 40:
pic_name = pic_name[:40] + '.jpg'
# pic_name = pic_name.split('：')[1] + '.jpg'
response = requests.get(item_pic_link)
if response.status_code == 200:
# 防止图片重名
if os.path.exists(pic_save_path + pic_name):
pic_name = pic_name.split('.')[0] + "_" + str(int(time.time())) + '.jpg'
with open(pic_save_path + pic_name, 'wb') as f:
f.write(response.content)
if user_nickname in item_text:
if '留言' in item_text:
leave_message.append(item)
leave_message.append(item[:-1])
elif '转发' in item_text:
forward_message.append(item)
else:
user_message.append(item)
else:
other_message.append(item)
pd.DataFrame(user_message, columns=['时间', '内容', '图片链接']).to_excel(user_save_path + Request.uin + '_说说列表.xlsx', index=False)
pd.DataFrame(forward_message, columns=['时间', '内容', '图片链接']).to_excel(user_save_path + Request.uin + '_转发列表.xlsx', index=False)
pd.DataFrame(leave_message, columns=['时间', '内容', '图片链接']).to_excel(user_save_path + Request.uin + '_留言列表.xlsx', index=False)
pd.DataFrame(other_message, columns=['时间', '内容', '图片链接']).to_excel(user_save_path + Request.uin + '_其他列表.xlsx', index=False)
other_message.append(item[:-1])
pd.DataFrame(user_message, columns=['时间', '内容', '图片链接', '评论']).to_excel(
user_save_path + Request.uin + '_说说列表.xlsx', index=False)
pd.DataFrame(forward_message, columns=['时间', '内容', '图片链接', '评论']).to_excel(
user_save_path + Request.uin + '_转发列表.xlsx', index=False)
pd.DataFrame(leave_message, columns=['时间', '内容', '图片链接']).to_excel(
user_save_path + Request.uin + '_留言列表.xlsx', index=False)
pd.DataFrame(other_message, columns=['时间', '内容', '图片链接']).to_excel(
user_save_path + Request.uin + '_其他列表.xlsx', index=False)
render_html(user_save_path + Request.uin + '_说说列表.xlsx', user_save_path + Request.uin + '_转发列表.xlsx')
Tools.show_author_info()
print('\033[36m' + '导出成功,请查看 ' + user_save_path + Request.uin + ' 文件夹内容' + '\033[0m')
@ -139,8 +174,9 @@ def save_data():
# os.startfile(current_directory + user_save_path[1:])
open_file(current_directory + user_save_path[1:])
os.system('pause')
# 打开文件展示
def open_file(file_path):
# 检查操作系统
if platform.system() == 'Windows':
@ -156,6 +192,11 @@ def open_file(file_path):
print(f"Unsupported OS: {platform.system()}")
def get_content_from_split(content):
content_split = str(content).split("：")
return content_split[1].strip() if len(content_split) > 1 else content.strip()
if __name__ == '__main__':
try:
user_info = Request.get_login_user_info()
@ -175,7 +216,6 @@ if __name__ == '__main__':
# 注册信号处理函数
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
for i in trange(int(count / 100) + 1, desc='Progress', unit='100条'):
message = Request.get_message(i * 100, 100).content.decode('utf-8')
time.sleep(0.2)
@ -205,10 +245,18 @@ if __name__ == '__main__':
img = img_element.find('img').get('src')
if text not in [sublist[1] for sublist in texts]:
texts.append([put_time, text, img])
if len(texts) > 0:
save_data()
except Exception as e:
print(f"发生异常: {str(e)}")
if len(texts) > 0:
save_data()
print(f"获取QQ空间互动消息发生异常: {str(e)}")
texts = [t + [""] for t in texts] # 确保texts是四列, 防止后续保存结果出现问题
try:
user_moments = GetAllMoments.get_visible_moments_list()
if user_moments and len(user_moments) > 0:
# 如果可见说说的内容是从消息列表恢复的说说内容子集,则不添加到消息列表中
texts = [t for t in texts if
not any(get_content_from_split(u[1]) in get_content_from_split(t[1]) for u in user_moments)]
texts.extend(user_moments)
except Exception as err:
print(f"获取未删除QQ空间记录发生异常: {str(err)}")
if len(texts) > 0:
save_data()
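The list comprehension above keeps an interaction-list row only when none of the freshly fetched moments is contained in its content, so the same post isn't exported twice from the two sources. A small standalone sketch of that containment check, with made-up rows:

```python
def get_content_from_split(content):
    # Keep only the text after the "昵称：" prefix, if one is present.
    parts = str(content).split("：")
    return parts[1].strip() if len(parts) > 1 else str(content).strip()

# Made-up rows in the [时间, 内容, 图片链接, 评论] shape used above.
texts = [
    ["2024年9月1日 20:00", "小明：今天天气不错,出去走走", "", ""],        # also still visible below
    ["2024年9月2日 21:00", "小明：一条只存在于互动消息里的说说", "", ""],
]
user_moments = [
    ["2024年9月1日 20:00:30", "小明：今天天气不错,出去走走", "http://example.com/a.jpg", []],
]

# Drop interaction rows already covered by a fetched moment, then merge the rest.
texts = [t for t in texts
         if not any(get_content_from_split(u[1]) in get_content_from_split(t[1]) for u in user_moments)]
texts.extend(user_moments)
print(len(texts))  # 2: the duplicate row is replaced by the richer fetched version
```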

util/GetAllMomentsUtil.py (new file)

@ -0,0 +1,137 @@
import json
import os
import re
import sys
import time
import requests
from tqdm import tqdm
from util import RequestUtil as Request
from util import LoginUtil
from util import ToolsUtil as Tool
WORKDIR = "./resource/fetch-all/"
USER_QZONE_INFO = 'user_qzone_info.json'
QZONE_MOMENTS_ALL = 'qzone_moments_all.json'
# 获取所有可见的未删除的说说+高清图片,包含2014年之前
def get_visible_moments_list():
# 1. 获取说说总条数
user_qzone_info = Tool.read_txt_file(WORKDIR, USER_QZONE_INFO)
if not user_qzone_info:
# 样本缓存未找到,开始请求获取样本
qq_userinfo_response = get_user_qzone_info(1)
Tool.write_txt_file(WORKDIR, USER_QZONE_INFO, qq_userinfo_response)
user_qzone_info = Tool.read_txt_file(WORKDIR, USER_QZONE_INFO)
if not Tool.is_valid_json(user_qzone_info):
print("获取QQ空间信息失败")
return None
json_dict = json.loads(user_qzone_info)
totalMomentsCount = json_dict['total']
print(f'你的未删除说说总条数：{totalMomentsCount}')
# 当前未删除说说总数为0, 直接返回
if totalMomentsCount == 0:
return None
# 2. 获取所有说说数据
print("开始获取所有未删除说说")
qzone_moments_all = Tool.read_txt_file(WORKDIR, QZONE_MOMENTS_ALL)
if not qzone_moments_all:
# 缓存未找到,开始请求获取所有未删除说说
qq_userinfo_response = get_user_qzone_info(totalMomentsCount)
Tool.write_txt_file(WORKDIR, QZONE_MOMENTS_ALL, qq_userinfo_response)
qzone_moments_all = Tool.read_txt_file(WORKDIR, QZONE_MOMENTS_ALL)
if not Tool.is_valid_json(qzone_moments_all):
print("获取QQ空间说说失败")
return None
json_dict = json.loads(qzone_moments_all)
qzone_moments_list = json_dict['msglist']
print(f'已获取到数据的说说总条数：{len(qzone_moments_list)}')
# 3. 添加说说列表
texts = []
for item in tqdm(qzone_moments_list, desc="获取未删除说说", unit="条"):
content = item['content'] if item['content'] else ""
nickname = item['name']
create_time = Tool.format_timestamp(item['created_time'])
pictures = ""
# 如果有图片
if 'pic' in item:
for index, picture in enumerate(item['pic']):
pictures += picture['url1'] + ","
# 去除最后一个逗号
pictures = pictures[:-1] if pictures != "" else pictures
comments = []
if 'commentlist' in item:
for index, commentToMe in enumerate(item['commentlist']):
comment_content = commentToMe['content']
comment_create_time = commentToMe['createTime2']
comment_nickname = commentToMe['name']
comment_uin = commentToMe['uin']
# 时间、内容、昵称、QQ号
comments.append([comment_create_time, comment_content, comment_nickname, comment_uin])
# 格式:时间、内容、图片链接、评论内容
texts.append([create_time, f"{nickname}：{content}", pictures, comments])
return texts
# 获取用户QQ空间相关信息
def get_user_qzone_info(num):
url = 'https://user.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6'
cookies = Request.cookies
g_tk = LoginUtil.bkn(cookies.get('p_skey'))
qqNumber = re.sub(r'o0*', '', cookies.get('uin'))
skey = cookies.get('skey')
p_uin = cookies.get('p_uin')
pt4_token = cookies.get('pt4_token')
p_skey = cookies.get('p_skey')
headers = {
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'cookie': f'uin={p_uin};skey={skey};p_uin={p_uin};pt4_token={pt4_token};p_skey={p_skey}',
'priority': 'u=1, i',
'referer': f'https://user.qzone.qq.com/{qqNumber}/main',
'sec-ch-ua': '"Not;A=Brand";v="24", "Chromium";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Linux"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
}
params = {
'uin': f'{qqNumber}',
'ftype': '0',
'sort': '0',
'pos': '0',
'num': f'{num}',
'replynum': '100',
'g_tk': f'{g_tk}',
'callback': '_preloadCallback',
'code_version': '1',
'format': 'jsonp',
'need_private_comment': '1'
}
try:
response = requests.get(url, headers=headers, params=params)
except Exception as e:
print(e)
rawResponse = response.text
# 使用正则表达式去掉 _preloadCallback(),并提取其中的 JSON 数据
raw_txt = re.sub(r'^_preloadCallback\((.*)\);?$', r'\1', rawResponse, flags=re.S)
# 再转一次是为了去掉响应值本身自带的转义符http:\/\/
json_dict = json.loads(raw_txt)
if json_dict['code'] != 0:
print(f"错误 {json_dict['message']}")
sys.exit(1)
return json.dumps(json_dict, indent=2, ensure_ascii=False)
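The msglist endpoint answers with JSONP rather than plain JSON, which is why the response is stripped of its `_preloadCallback(...)` wrapper before `json.loads`. A tiny sketch of that unwrapping step on a made-up response body:

```python
import json
import re

# Made-up response in the shape the endpoint returns: JSON wrapped in a JSONP callback.
raw_response = '_preloadCallback({"code": 0, "total": 2, "msglist": []});'

# Same idea as above: strip "_preloadCallback(" ... ");" and parse what is left.
raw_txt = re.sub(r'^_preloadCallback\((.*)\);?$', r'\1', raw_response, flags=re.S)
json_dict = json.loads(raw_txt)
print(json_dict['total'])  # 2
```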
if __name__ == '__main__':
get_visible_moments_list()

View File

@ -91,7 +91,7 @@ def cookie():
print(time.strftime('%H:%M:%S'), '二维码认证中')
elif '二维码已失效' in r.text:
print(time.strftime('%H:%M:%S'), '二维码已失效')
else:
elif '登录成功' in r.text:
print(time.strftime('%H:%M:%S'), '登录成功')
cookies = requests.utils.dict_from_cookiejar(r.cookies)
uin = requests.utils.dict_from_cookiejar(r.cookies).get('uin')
@ -111,6 +111,8 @@ def cookie():
except Exception as e:
print(e)
else:
print(time.strftime('%H:%M:%S'), '用户取消登录')
except Exception as e:
print(e)

util/ToolsUtil.py

@ -1,4 +1,7 @@
import re
import json
import os
import time
# 提取两个字符串之间的内容
@ -37,7 +40,7 @@ def show_author_info():
BLUE = '\033[34m'
RESET = '\033[0m'
RED = '\033[31m'
author_art = r'''
@ -49,7 +52,7 @@ def show_author_info():
'''
print(CYAN + author_art + RESET)
author_info = f"{YELLOW}bilibili{RESET} {BLUE}@高数带我飞{RESET} {YELLOW}GetQzonehistory V1.0{RESET}"
print(author_info)
print(f'{RED}程序完全免费且在github开源{RESET}')
@ -82,14 +85,14 @@ def get_html_template():
float: left;
margin-right: 20px;
}}
.content {{
overflow: hidden;
}}
.avatar img {{
width: 50px;
height: 50px;
border-radius: 50%;
}}
.content {{
overflow: hidden;
}}
.nickname {{
font-size: 1.2em;
font-weight: bold;
@ -104,15 +107,47 @@ def get_html_template():
}}
.image {{
margin-top: 10px;
display: flex;
justify-content: space-around;
align-items: center; /* 使两张图片垂直对齐 */
padding: 20px;
}}
.image img {{
max-width: 100%;
max-width: 33vw;
max-height: 33vh;
border-radius: 10px;
}}
.comments {{
margin-top: 5px; /* 调整这里的值来减少间距 */
background-color: #444;
padding: 2px 10px 10px 10px;
border-radius: 10px;
}}
.comment {{
margin-top: 10px; /* 调整单个评论之间的间距 */
padding: 10px;
background-color: #555;
border-radius: 10px;
color: #fff;
}}
.comment .avatar img {{
width: 30px;
height: 30px;
}}
.comment .nickname {{
font-size: 1em;
font-weight: bold;
}}
.comment .time {{
font-size: 0.8em;
color: #aaa;
}}
</style>
</head>
<body>
{posts}
</body>
</html>
"""
@ -129,7 +164,59 @@ def get_html_template():
<div class="message">{message}</div>
{image}
</div>
{comments}
</div>
"""
return html_template, post_template
# 评论区HTML模板
comment_template = """
<div class="comments">
<div class="comment">
<div class="avatar">
<img src="{avatar_url}" alt="评论头像">
</div>
<div class="nickname">{nickname}</div>
<div class="time">{time}</div>
<div class="message">{message}</div>
</div>
</div>
"""
return html_template, post_template, comment_template
# 格式化时间
def format_timestamp(timestamp):
time_struct = time.localtime(timestamp)
formatted_time = time.strftime("%Y年%m月%d日 %H:%M:%S", time_struct)
return formatted_time
# 判断json是否合法
def is_valid_json(json_data):
try:
json_object = json.loads(json_data) # 尝试解析JSON数据
return True # 解析成功,是有效的JSON
except ValueError as e: # 解析失败,捕获异常
print(e)
return False # 解析失败,不是有效的JSON
# 写入信息
def write_txt_file(workdir, file_name, data):
if not os.path.exists(workdir):
os.makedirs(workdir)
base_path_file_name = os.path.join(workdir, file_name)
with open(base_path_file_name, 'w', encoding='utf-8') as file:
file.write(data)
# 读取文件信息
def read_txt_file(workdir, file_name):
base_path_file_name = os.path.join(workdir, file_name)
if os.path.exists(base_path_file_name):
with open(base_path_file_name, 'r', encoding='utf-8') as file:
return file.read()
return None
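These two helpers back the simple fetch-once cache that GetAllMomentsUtil keeps under resource/fetch-all/: read the file if it already exists, otherwise request the data and write it for later runs. A minimal usage sketch of that pattern, writing to a temporary directory instead of the real cache path:

```python
import json
import os
import tempfile

def write_txt_file(workdir, file_name, data):
    # Create the cache directory on first use, then write the raw text.
    os.makedirs(workdir, exist_ok=True)
    with open(os.path.join(workdir, file_name), 'w', encoding='utf-8') as f:
        f.write(data)

def read_txt_file(workdir, file_name):
    # Return the cached text, or None when no cache file exists yet.
    path = os.path.join(workdir, file_name)
    if not os.path.exists(path):
        return None
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

workdir = tempfile.mkdtemp()                      # stand-in for ./resource/fetch-all/
cached = read_txt_file(workdir, 'user_qzone_info.json')
if not cached:                                    # first run: nothing cached yet
    write_txt_file(workdir, 'user_qzone_info.json', json.dumps({"total": 42}))
    cached = read_txt_file(workdir, 'user_qzone_info.json')
print(json.loads(cached)["total"])                # 42
```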