简单实现查找历史说说并导出为excel

This commit is contained in:
LibraHp_0928 2024-02-13 20:47:18 +08:00
parent 672b82746e
commit 9f5a6cc5d1
6 changed files with 27 additions and 9 deletions

17
main.py
View File

@ -2,12 +2,19 @@ from bs4 import BeautifulSoup
from tqdm import trange from tqdm import trange
import util.RequestUtil as Request import util.RequestUtil as Request
import util.ToolsUtil as Tools import util.ToolsUtil as Tools
import util.ConfigUtil as Config
import pandas as pd import pandas as pd
if __name__ == '__main__': if __name__ == '__main__':
user_info = Request.get_login_user_info()
user_nickname = user_info[Request.uin][6]
print(f"用户<{Request.uin}>,<{user_nickname}>登录成功")
texts = [] texts = []
for i in trange(2, desc='Progress', unit='iteration'): for i in trange(1000, desc='Progress', unit='iteration'):
message = Request.get_message(i*100, 100).content.decode('utf-8') message = Request.get_message(i * 100, 100).content.decode('utf-8')
html = Tools.process_old_html(message) html = Tools.process_old_html(message)
if "li" not in html:
break
soup = BeautifulSoup(html, 'html.parser') soup = BeautifulSoup(html, 'html.parser')
for element in soup.find_all('p', class_='txt-box-title ellipsis-one'): for element in soup.find_all('p', class_='txt-box-title ellipsis-one'):
text = element.get_text().replace('\xa0', ' ') text = element.get_text().replace('\xa0', ' ')
@ -16,6 +23,6 @@ if __name__ == '__main__':
# 创建一个DataFrame对象 # 创建一个DataFrame对象
df = pd.DataFrame(texts, columns=['内容']) df = pd.DataFrame(texts, columns=['内容'])
# 将DataFrame对象导出为CSV文件使用UTF-8编码 # 导出为Excel
df.to_excel('output.xlsx', index=False) df.to_excel(Config.result_path + Request.uin + '.xlsx', index=False)
print(texts) print('导出成功,请查看 ' + Config.result_path + Request.uin + '.xlsx')

View File

@ -1,3 +1,4 @@
[File] [File]
temp = ./resource/temp/ temp = ./resource/temp/
user = ./resource/user/ user = ./resource/user/
result = ./resource/result/

View File

@ -6,6 +6,7 @@ config.read('./resource/config/config.ini')
temp_path = config.get('File', 'temp') temp_path = config.get('File', 'temp')
user_path = config.get('File', 'user') user_path = config.get('File', 'user')
result_path = config.get('File', 'result')
def save_user(cookies): def save_user(cookies):

View File

@ -2,9 +2,9 @@ import requests
from PIL import Image from PIL import Image
import time import time
import re import re
import os
import util.ConfigUtil as Config import util.ConfigUtil as Config
def bkn(pSkey): def bkn(pSkey):
# 计算bkn # 计算bkn
t, n, o = 5381, 0, len(pSkey) t, n, o = 5381, 0, len(pSkey)

View File

@ -1,6 +1,6 @@
import util.LoginUtil as Login import util.LoginUtil as Login
import requests import requests
import json
# 登陆后获取到的cookies # 登陆后获取到的cookies
cookies = Login.cookie() cookies = Login.cookie()
# 获取g_tk # 获取g_tk
@ -52,3 +52,12 @@ def get_message(start, count):
response = requests.get('https://user.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds2_html_pav_all', response = requests.get('https://user.qzone.qq.com/proxy/domain/ic2.qzone.qq.com/cgi-bin/feeds/feeds2_html_pav_all',
params=params, cookies=cookies, headers=headers) params=params, cookies=cookies, headers=headers)
return response return response
def get_login_user_info():
    """Fetch the logged-in user's portrait/profile info from Qzone.

    Calls the ``cgi_get_portrait.fcg`` endpoint with the module-level
    ``g_tk``, ``uin``, ``headers`` and ``cookies``, unwraps the JSONP
    envelope and returns the parsed payload.

    Returns:
        The JSON-decoded object (a dict keyed by uin, based on how the
        caller indexes it as ``info[uin][6]`` — confirm against the API).
    """
    response = requests.get(
        'https://r.qzone.qq.com/fcg-bin/cgi_get_portrait.fcg?g_tk=' + str(g_tk) + '&uins=' + uin,
        headers=headers, cookies=cookies)
    # The body is GBK-encoded JSONP of the form: portraitCallBack({...});
    info = response.content.decode('GBK').strip()
    # Remove the exact JSONP wrapper. The previous code used
    # str.lstrip('portraitCallBack(') / str.rstrip(');'), but those strip
    # character *sets*, not literal prefixes/suffixes, and could silently
    # eat leading/trailing payload characters.
    prefix = 'portraitCallBack('
    suffix = ');'
    if info.startswith(prefix):
        info = info[len(prefix):]
    if info.endswith(suffix):
        info = info[:-len(suffix)]
    return json.loads(info)

View File

@ -25,7 +25,7 @@ def process_old_html(message):
new_text = re.sub(r'\\x[0-9a-fA-F]{2}', replace_hex, message) new_text = re.sub(r'\\x[0-9a-fA-F]{2}', replace_hex, message)
start_string = "html:'" start_string = "html:'"
end_string = "\'\,opuin" end_string = "',opuin"
new_text = extract_string_between(new_text, start_string, end_string) new_text = extract_string_between(new_text, start_string, end_string)
new_text = replace_multiple_spaces(new_text).replace('\\', '') new_text = replace_multiple_spaces(new_text).replace('\\', '')
return new_text return new_text