简单实现查找历史说说并导出为excel

This commit is contained in:
LibraHp_0928 2024-02-13 20:47:18 +08:00
parent 672b82746e
commit 9f5a6cc5d1
6 changed files with 27 additions and 9 deletions

17
main.py
View File

@ -2,12 +2,19 @@ from bs4 import BeautifulSoup
from tqdm import trange
import util.RequestUtil as Request
import util.ToolsUtil as Tools
import util.ConfigUtil as Config
import pandas as pd
if __name__ == '__main__':
user_info = Request.get_login_user_info()
user_nickname = user_info[Request.uin][6]
print(f"用户<{Request.uin}>,<{user_nickname}>登录成功")
texts = []
for i in trange(2, desc='Progress', unit='iteration'):
message = Request.get_message(i*100, 100).content.decode('utf-8')
for i in trange(1000, desc='Progress', unit='iteration'):
message = Request.get_message(i * 100, 100).content.decode('utf-8')
html = Tools.process_old_html(message)
if "li" not in html:
break
soup = BeautifulSoup(html, 'html.parser')
for element in soup.find_all('p', class_='txt-box-title ellipsis-one'):
text = element.get_text().replace('\xa0', ' ')
@ -16,6 +23,6 @@ if __name__ == '__main__':
# 创建一个DataFrame对象
df = pd.DataFrame(texts, columns=['内容'])
# 将DataFrame对象导出为Excel文件
df.to_excel('output.xlsx', index=False)
print(texts)
# 导出为Excel
df.to_excel(Config.result_path + Request.uin + '.xlsx', index=False)
print('导出成功,请查看 ' + Config.result_path + Request.uin + '.xlsx')

View File

@ -1,3 +1,4 @@
[File]
temp = ./resource/temp/
user = ./resource/user/
user = ./resource/user/
result = ./resource/result/

View File

@ -6,6 +6,7 @@ config.read('./resource/config/config.ini')
temp_path = config.get('File', 'temp')
user_path = config.get('File', 'user')
result_path = config.get('File', 'result')
def save_user(cookies):

View File

@ -2,9 +2,9 @@ import requests
from PIL import Image
import time
import re
import os
import util.ConfigUtil as Config
def bkn(pSkey):
# 计算bkn
t, n, o = 5381, 0, len(pSkey)

View File

@ -1,6 +1,6 @@
import util.LoginUtil as Login
import requests
import json
# 登陆后获取到的cookies
cookies = Login.cookie()
# 获取g_tk
@ -52,3 +52,12 @@ def get_message(start, count):
response = requests.get('https://user.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds2_html_pav_all',
params=params, cookies=cookies, headers=headers)
return response
def get_login_user_info():
    """Fetch the portrait info of the logged-in user and return it as parsed JSON.

    Sends a GET request to the Qzone ``cgi_get_portrait.fcg`` endpoint using the
    module-level ``g_tk``, ``uin``, ``headers`` and ``cookies``. The body is
    decoded as GBK, the ``portraitCallBack(...)`` JSONP wrapper is removed, and
    the remaining text is parsed with ``json.loads``.

    Returns:
        The object produced by ``json.loads`` on the unwrapped payload.

    Raises:
        json.JSONDecodeError: if the unwrapped payload is not valid JSON.
    """
    response = requests.get('https://r.qzone.qq.com/fcg-bin/cgi_get_portrait.fcg?g_tk=' + str(g_tk) + '&uins=' + uin,
                            headers=headers, cookies=cookies)
    payload = response.content.decode('GBK').strip()

    # str.lstrip/rstrip take a *set of characters*, not a literal prefix or
    # suffix, so they could also eat leading/trailing payload characters that
    # happen to be in that set. Remove the exact JSONP wrapper instead.
    prefix = 'portraitCallBack('
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    payload = payload.rstrip(';').rstrip()
    if payload.endswith(')'):
        payload = payload[:-1]

    return json.loads(payload)

View File

@ -25,7 +25,7 @@ def process_old_html(message):
new_text = re.sub(r'\\x[0-9a-fA-F]{2}', replace_hex, message)
start_string = "html:'"
end_string = "\'\,opuin"
end_string = "',opuin"
new_text = extract_string_between(new_text, start_string, end_string)
new_text = replace_multiple_spaces(new_text).replace('\\', '')
return new_text