To keep things organized, I put each individual paper into its own folder, containing both the materials and the questions.
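Each paper is saved under a folder for the chosen category, in a subfolder named from the paper's name, date and topic. A sketch of the resulting layout, assuming 国考 was picked as the category:

国考/
    <name><date><topic>/    (one folder per paper)
        materials.txt       (the reading materials)
        solution.txt        (the reference answers)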
[Python]
import requests
from pprint import pprint as pp
from lxml import etree
import os
from pathlib import Path
s = requests.Session()
headers = {
"accept": "application/json, text/plain, */*",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
"cache-control": "no-cache",
"origin": "https://spa.fenbi.com",
"pragma": "no-cache",
"priority": "u=1, i",
"referer": "https://spa.fenbi.com/",
"sec-ch-ua": "\"Microsoft Edge\";v=\"137\", \"Chromium\";v=\"137\", \"Not/A)Brand\";v=\"24\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-site",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0"
}
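# These headers were captured wholesale from the browser's DevTools; most are
# probably optional, but keeping them all mimics a real browser request.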
cookies = {
"sajssdk_2015_cross_new_user": "1",
"sid": "1647307",
"acw_tc": "0bd17c1217490332588544632efe03abc97e040afed07fd6d541b713c47dae",
"Hm_lvt_e7351028cde0d0ccb9ccdbe5fe531683": "1749033310",
"HMACCOUNT": "6993B8743513AD1A",
"userid": "82533497",
"sess": "hFjQMwOON2rzkwQR3QczjFf7H44+jXDXfc/NgWPyJSI3NH2EukBfOY4pRG5zI3ukf+CGNsMmu+h1PqXKX/eS00mA2MmQqOfoMhtw/JjRVLs=",
"persistent": "Vd1+6HIhRsiOgL1GRyrD5/SLk756pPaK9CuHNR2dAmcfoHSZrgpL2Vtzmd/3hi+LvXxKD/LnoSBEv9czCo5xYQ==",
"sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%2282533497%22%2C%22first_id%22%3A%221973a829598657-08eeb0011facd8-4c657b58-1638720-1973a8295991f14%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fcn.bing.com%2F%22%7D%2C%22%24device_id%22%3A%221973a2f890c99d-0f8d3d7a7e2f018-4c657b58-1638720-1973a2f890d1407%22%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTk3M2E4MjU0MmIyNTEtMGRmOGM4OTYwNGEyYTktNGM2NTdiNTgtMTYzODcyMC0xOTczYTgyNTQyYzFjZmYiLCIkaWRlbnRpdHlfbG9naW5faWQiOiI4MjUzMzQ5NyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%2282533497%22%7D%7D",
"Hm_lpvt_e7351028cde0d0ccb9ccdbe5fe531683": "1749033379"
}
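# Note: the cookies are tied to my logged-in session (sess/persistent appear to
# carry the login state) and will expire; capture fresh values from your own browser
# before running the script.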
url = "https://tiku.fenbi.com/api/shenlun/subLabels"
params = {
"kav": "100",
"av": "100",
"hav": "100",
"app": "web"
}
data_list = s.get(url, headers=headers, cookies=cookies, params=params).json()
# print(data_list)
result_dict = {item['name']: item['labelMeta']['paperIds'] for item in data_list}
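# Inferred from the fields used here: each item in data_list looks roughly like
# {'name': '国考', 'labelMeta': {'id': ..., 'paperIds': [...]}}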
mode = input('Enter the question-bank category to fetch (keywords such as 小模考, 国考, 选调生, 公安招警和公检法, or a region like 安徽, 北京): ')
print(result_dict[mode])
# numlabel_id = next((item['labelMeta']['id'] for item in data_list if item.get('name') == '国考'), None)
# print(numlabel_id)
url_0 = 'https://tiku.fenbi.com/api/shenlun/papers/'
url_list = [url_0 + str(item) for item in result_dict[mode]]
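# Judging by the fields read below, each paper endpoint returns JSON with at least
# 'name', 'date', 'topic', 'id' and 'encodeCheckInfo'.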
for url in url_list:
    data_2 = s.get(url, headers=headers, cookies=cookies, params=params).json()
    name = data_2['name']
    date = data_2['date']
    topic = data_2['topic']
    title = name + date + topic
    print(title)
    os_path = Path.cwd() / mode / title
    print(os_path)
    # Create the per-paper directory if it does not exist yet
    # (exist_ok=True already tolerates an existing directory, so the check is just belt and braces)
    if not os_path.exists():
        os_path.mkdir(parents=True, exist_ok=True)
    params_2 = {'type': '7',
                'id': data_2['id'],
                'checkId': data_2['encodeCheckInfo'],
                **params}
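    # 'type': '7' and 'checkId' are copied as-is from the requests the web page
    # itself issues; I have not explored what other type values select.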
    # print(params_2)
    url_3 = 'https://tiku.fenbi.com/api/shenlun/universal/auth/solutions'
    finaldata = s.get(url_3, headers=headers, cookies=cookies, params=params_2).json()
    materials = finaldata['materials']
    solutions = finaldata['solutions']
    materials_content = [item['content'] for item in materials]
    solutions_content = [item['content'] for item in solutions]
    print('Downloading...')
    # Assume materials_content is a list of HTML fragments, e.g.:
    # materials_content = ['<p>paragraph 1</p><p>paragraph 2</p>',
    #                      '<p>paragraph 3</p><p>paragraph 4</p>']
    with open(os_path / 'materials.txt', 'w', encoding='utf-8') as f:
        for key, material in enumerate(materials_content):  # one material per iteration
            f.write(f'材料{key + 1}: --------------\n')  # label each material block in the output file
            # Parse the HTML string into an lxml Element tree
            material = etree.HTML(material)
            # Select every <p> element
            p_elements = material.xpath('//p')
            # print(p_elements)  # debug: inspect the matched elements
            for p_element in p_elements:  # one paragraph per iteration
                # Gather all text nodes inside the element, join them and strip
                # surrounding whitespace (checking the joined text rather than
                # p_element.text, so paragraphs starting inside a child tag are kept)
                p_content = ''.join(p_element.xpath('.//text()')).strip()
                if p_content:  # skip empty paragraphs
                    print(p_content)
                    f.write(p_content + '\n')
            f.write('\n')  # blank line between materials
    # Assume solutions_content is a list of HTML fragments with bold answer text, e.g.:
    # solutions_content = ['<p><b>paragraph 1</b></p>', '<p><b>paragraph 2</b></p>']
    with open(os_path / 'solution.txt', 'w', encoding='utf-8') as f:
        for line in solutions_content:
            # Parse the HTML string into an lxml Element tree
            html = etree.HTML(line)
            # Select the text of every <b> nested in a <p>
            content = html.xpath('.//p/b/text()')
            print(''.join(content))
            # Write the extracted text followed by a blank line
            f.write(''.join(content) + '\n\n')
    print('Done...')
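Usage: log in to fenbi in your browser first and swap in your own cookie values, then run the script and type one of the category keywords (e.g. 国考) at the prompt; each paper's materials and reference answers end up as materials.txt and solution.txt in that paper's folder.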