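# Note: by default only the first ten search results are downloaded. The script
# sends plain requests with no throttling and no anti-bot workarounds, so please
# crawl sparingly and do not put undue load on the site.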
# Import the required modules
import os
import re
from urllib.parse import quote, urljoin

import requests
from lxml import etree
# Directory the downloaded tracks are written to. makedirs(exist_ok=True)
# creates it in one step, avoiding the check-then-create race of calling
# os.path.exists() followed by os.mkdir().
os.makedirs("music", exist_ok=True)

# Static cookie values sent along with the search request.
cookies = {
    'Hm_tf_t8h1bavogbi': '1757049760',
    'Hm_lvt_t8h1bavogbi': '1759760559,1759853102',
    'Hm_lpvt_t8h1bavogbi': '1760930516',
}

# Browser-like request headers shared by the page requests and the
# play-url API request.
headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'no-cache',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'origin': 'https://www.gequbao.com',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.gequbao.com/music/39466',
    'sec-ch-ua': '"Microsoft Edge";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0',
    'x-requested-with': 'XMLHttpRequest',
    # No raw 'cookie' header here: cookies are passed separately through the
    # `cookies` dict above.
}
# Search the site for a keyword and collect the matching songs' detail-page links
def get_songs(inp, limit=10, timeout=10):
    """Search the site for *inp* and return the detail-page links of the hits.

    Args:
        inp: Search keyword (for example an artist name). It is
            percent-encoded before being placed in the URL path.
        limit: Maximum number of result rows to read. Defaults to 10.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{"name": <song title>, "url": <absolute detail URL>}``
        dicts in page order. Rows that lack a title or a link are skipped.

    Raises:
        requests.RequestException: If the search request fails or times out.
    """
    # Encode the keyword so characters such as "/", "?" or "#" cannot change
    # the structure of the URL.
    keyword = quote(inp, safe="")
    url = f"https://www.gequbao.com/s/{keyword}"
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(
        url, headers=headers, cookies=cookies, timeout=timeout
    )
    html = etree.HTML(response.text)
    if html is None:
        # NOTE(review): guards against the parser yielding no document for an
        # empty body -- confirm against the lxml version in use.
        return []
    # The first matching <div> is skipped (presumably a header row -- TODO
    # confirm against the live page); the next `limit` rows are the results.
    songs = html.xpath("//div[@class='card-text']/div")[1:1 + max(limit, 0)]
    print(len(songs))
    title_xpath = (
        ".//span[@class='text-primary font-weight-bolder music-title "
        "d-md-inline-block align-middle']/span/text()"
    )
    links = []
    for song in songs:
        titles = song.xpath(title_xpath)
        hrefs = song.xpath(".//a/@href")
        # A row without a title or a link must not abort the whole search.
        if not titles or not hrefs:
            continue
        # urljoin copes with both relative and already-absolute hrefs, e.g.
        # "/music/39466" -> "https://www.gequbao.com/music/39466".
        full_url = urljoin("https://www.gequbao.com", hrefs[0])
        links.append({"name": titles[0], "url": full_url})
    return links
def _safe_file_name(name):
    """Return *name* reduced to a string usable as a single file name."""
    # Path separators, control characters and the characters Windows rejects
    # are replaced so a scraped title can neither escape the music/ directory
    # nor make open() fail.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", name).strip(" .")
    return cleaned or "unknown"


def request_music(name, url, timeout=30):
    """Download the song behind detail page *url* into ``music/<name>.mp3``.

    The detail page is fetched to extract the song's ``play_id``; that id is
    posted to the site's play-url API, and the audio URL it returns is
    streamed to disk.

    Args:
        name: Song title; used (after sanitising) as the file name.
        url: Absolute URL of the song's detail page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status from any of the three requests.
        ValueError: If the page has no ``play_id`` or the API response
            carries no download URL.
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()
    match = re.search(r'"play_id":"(.*?)"', page.text)
    if match is None:
        raise ValueError(f"play_id not found on {url}")

    # Ask the API for the real audio URL.
    api_response = requests.post(
        "https://www.gequbao.com/api/play-url",
        headers=headers,
        data={"id": match.group(1)},
        timeout=timeout,
    )
    api_response.raise_for_status()
    payload = api_response.json()
    try:
        play_url = payload["data"]["url"]
    except (KeyError, TypeError) as err:
        raise ValueError(
            f"play-url API returned no download URL for {name!r}: {payload!r}"
        ) from err
    if not play_url:
        raise ValueError(f"play-url API returned an empty URL for {name!r}")

    file_name = os.path.join("music", _safe_file_name(name) + ".mp3")
    # Stream the audio in chunks instead of buffering the whole file, and only
    # open the output file once the server has answered successfully, so an
    # error page is never saved as an .mp3.
    with requests.get(play_url, stream=True, timeout=timeout) as audio:
        audio.raise_for_status()
        with open(file_name, "wb") as f:
            for chunk in audio.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    print("下载完成")
def main():
    """Prompt for a keyword, search for it and download every hit."""
    # Read the search keyword; surrounding whitespace is not part of it.
    inp = input('请输入要采集的歌手:').strip()
    if not inp:
        # Nothing to search for, so do not send a request at all.
        print("未输入关键字,已退出")
        return
    links = get_songs(inp)
    if not links:
        print("没有找到相关歌曲")
        return
    for link in links:
        name = link["name"]
        url = link["url"]
        print(name, url)
        try:
            request_music(name, url)
        except Exception as err:
            # Top-level boundary: report the failure and carry on, so one bad
            # song does not abort the remaining downloads.
            print(f"下载失败: {name} ({err!r})")


if __name__ == '__main__':
    main()