import json
import os
import re
import threading
import time

import requests
from bs4 import BeautifulSoup  # used by get_text() below but was never imported
url_list = []
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
}
directory = "txt" # 相对路径,将在当前工作目录下创建txt目录
if not os.path.exists(directory):
os.makedirs(directory)
def get_list(bookid):#获取章节列表
data = {"bookId": bookid}
r = requests.post("https://bookapi.zongheng.com/api/chapter/getChapterList", data=data, headers=headers)
response_data = json.loads(r.text)
# print(response_data["result"]["chapterList"]["chapterViewList"]["chapterId"])
chapter_list = response_data["result"]["chapterList"]
for chapter in chapter_list:
for chapte in chapter["chapterViewList"]:
chapterId = chapte["chapterId"]
url_list.append(f"https://read.zongheng.com/chapter/{bookid}/{chapterId}.html")
return True
def get_text(url,Lock:threading.Lock):#访问正文
p_text = ""
for ur in url:
#Lock.acquire() # 锁
r = requests.get(ur,headers=headers)
#Lock.release()
soup = BeautifulSoup(r.text, 'html.parser')
name = soup.find(class_="title_txtbox").text #标题
contents = soup.find('div', class_="content") #正文
content = contents.find_all("p")
for conten in content:
p_text += conten.text+"\n\n"
name = re.sub('[?|&]',"",name.strip()) #正则过滤内容
#将标题和内容写进去
file_name = os.path.join("txt",name+".txt")
sava_file(file_name,p_text)
time.sleep(1)
print(name)
def sava_file(name,text):
with open(name,"w",encoding="utf8") as f:
f.write(text)
Chapter = get_list("1249806") #访问章节
Lock = threading.Lock() #设置线程锁
print("长度:"+str(len(url_list)))
if Chapter:
# 计算每个子列表的长度
num = int(input("输入线程数:")) #线程数
Length = len(url_list) // num
urls = [url_list[i:i+num] for i in range(0,len(url_list),num)] #对列表进行切片为子列表
for url in urls:
threading.Thread(target=get_text, args=(url,Lock)).start()
# NOTE(review): the two questions below were pasted as plain text and made the
# file a SyntaxError; kept verbatim as comments.
# 有一点我不是很明白,我测试使用线程池并发CPU直接占满,但是使用threading多线程并发却不会
# 还有一点保存下来的文件排序也不好排,如果设置线程锁,确实能按照排序,但是其他线程阻塞导致就像单线程一样慢