【爬虫学习】python采集cp网支持率

查看 73|回复 6
作者:dabiaoge144   
这是前几天接的一个单子,现在把代码发出来分享,仅供学习。
功能:采集网站信息,并保存到 Excel。
[图片:image.png]
import requests
import pandas as pd
from datetime import datetime, timedelta
def fetch_match_data(date, url, headers):
    """Fetch the match list for a single day.

    Args:
        date: Day to query; sent unchanged as both ``matchBeginDate`` and
            ``matchEndDate`` (the caller in this file passes ``YYYY-MM-DD``).
        url: Endpoint URL to query.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        The decoded JSON body when the server answers HTTP 200, otherwise
        ``None`` (also on network errors or an undecodable body).
    """
    params = {
        "matchPage": "1",
        "matchBeginDate": date,
        "matchEndDate": date,
        "leagueId": "",
        "pageSize": "100",
        "pageNo": "1",
        "isFix": "0",
        "pcOrWap": "1",
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, params=params, timeout=15)
    except requests.RequestException as exc:
        print(f"请求失败: {exc}")
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON; callers already treat None as "no data".
        return None
def parse_match_data(response_json):
    """Extract per-match columns from a match-list response.

    Only matches with a non-empty ``goalLine`` are kept. All columns are
    built from the same filtered list of matches, so index ``i`` refers to
    the same match in every returned list.

    Args:
        response_json: Decoded JSON body of the match-list request.

    Returns:
        A dict of equally long lists keyed by ``Time1``, ``mathstr``,
        ``TimeId``, ``LeagueName``, ``Home``, ``Away`` and ``rang``, or
        ``None`` when ``errorCode`` is not ``"0"`` or ``resultCount`` is
        not positive.
    """
    value = response_json.get("value") or {}
    if response_json.get("errorCode") != "0" or not value.get("resultCount", 0) > 0:
        print(f"没有比赛或数据错误: {response_json.get('errorMessage', '未知错误')}")
        return None

    # Output column name -> key in each match record.
    field_map = {
        "Time1": "matchDate",
        "mathstr": "matchNumStr",
        "TimeId": "matchId",
        "LeagueName": "leagueNameAbbr",
        "Home": "homeTeam",
        "Away": "awayTeam",
        "rang": "goalLine",
    }
    # Filter whole records (not individual columns) so rows stay aligned.
    kept = [
        match
        for match in value.get("matchResult", [])
        if match.get("goalLine", "") != ""
    ]
    return {
        column: [match.get(source_key, "") for match in kept]
        for column, source_key in field_map.items()
    }
def fetch_support_rates(match_ids, url2, headers):
    """Fetch support rates for a batch of matches.

    Args:
        match_ids: Iterable of match ids; joined with commas into the
            ``matchIds`` query parameter.
        url2: Endpoint URL to query.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        The decoded JSON body when the server answers HTTP 200, otherwise
        ``None`` (also on network errors or an undecodable body).
    """
    params2 = {
        "matchIds": ",".join(map(str, match_ids)),
        "poolCode": "hhad,had",
        "sportType": "1",
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url2, headers=headers, params=params2, timeout=15)
    except requests.RequestException as exc:
        print(f"请求失败: {exc}")
        return None
    # Raw body is echoed, as in the original script, to help debugging.
    print(response.text)
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON; callers already treat None as "no data".
        return None
def fetch_result_rates(data, url3, headers):
    """Fetch the result list for a single day.

    Args:
        data: Day to query; sent unchanged as both ``matchBeginDate`` and
            ``matchEndDate`` (the caller in this file passes ``YYYY-MM-DD``).
        url3: Endpoint URL to query.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        The decoded JSON body when the server answers HTTP 200, otherwise
        ``None`` (also on network errors or an undecodable body).
    """
    params = {
        "matchPage": "1",
        "matchBeginDate": data,
        "matchEndDate": data,
        "leagueId": "",
        "pageSize": "100",
        "pageNo": "1",
        "isFix": "0",
        "pcOrWap": "1",
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url3, headers=headers, params=params, timeout=15)
    except requests.RequestException as exc:
        print(f"请求失败: {exc}")
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON; callers already treat None as "no data".
        return None
def combine_data(parsed_match_data, support_data):
    """Merge parsed match columns with support-rate columns into one table.

    Match columns are copied from ``parsed_match_data``; support-rate
    columns are appended in the iteration order of ``support_data["value"]``.
    Shorter columns are padded with empty strings so every column has the
    same length before the DataFrame is built.

    NOTE(review): the source post was truncated inside the padding loop; the
    padding and the DataFrame return are reconstructed from how the caller
    indexes the result by column name. Confirm against the original script.

    Args:
        parsed_match_data: Dict of lists as returned by ``parse_match_data``.
        support_data: Decoded support-rate response; its ``"value"`` entry
            maps a match key to a dict holding ``"HAD"`` / ``"HHAD"`` dicts.

    Returns:
        A ``pandas.DataFrame`` with one column per key of the merged data.
    """
    extracted_data = {
        "时间": parsed_match_data["Time1"],
        # The "TimeId" column intentionally carries the match number string,
        # exactly as in the original script.
        "TimeId": parsed_match_data["mathstr"],
        "联赛": parsed_match_data["LeagueName"],
        "主队": parsed_match_data["Home"],
        "客队": parsed_match_data["Away"],
        "让球": parsed_match_data["rang"],
        "胜支持率": [],
        "平支持率": [],
        "负支持率": [],
        "让胜支持率": [],
        "让平支持率": [],
        "让负支持率": [],
    }
    # (output column, pool key, field inside that pool)
    rate_fields = (
        ("胜支持率", "HAD", "hSupportRate"),
        ("平支持率", "HAD", "dSupportRate"),
        ("负支持率", "HAD", "aSupportRate"),
        ("让胜支持率", "HHAD", "hSupportRate"),
        ("让平支持率", "HHAD", "dSupportRate"),
        ("让负支持率", "HHAD", "aSupportRate"),
    )
    for match_details in (support_data.get("value") or {}).values():
        details = match_details or {}
        for column, pool, field in rate_fields:
            extracted_data[column].append((details.get(pool) or {}).get(field, ""))

    # Make all columns the same length; DataFrame construction requires it.
    max_len = max(len(v) for v in extracted_data.values())
    padded = {
        key: list(values) + [""] * (max_len - len(values))
        for key, values in extracted_data.items()
    }
    return pd.DataFrame(padded)
def save_to_excel(dataframe, file_path):
    """Write ``dataframe`` to an Excel file, skipping empty frames.

    Args:
        dataframe: ``pandas.DataFrame`` to write.
        file_path: Destination path handed to ``DataFrame.to_excel``.

    Returns:
        ``None``. A status message is printed in either case.
    """
    if dataframe.empty:
        print("没有数据可保存。")
        return
    dataframe.to_excel(file_path, index=False)
    print(f"数据已保存在 {file_path}")
def _fit_length(values, size):
    """Return ``values`` as a list truncated or padded with "" to ``size``."""
    fitted = list(values)[:size]
    return fitted + [""] * (size - len(fitted))


def fetch_football_data(start_date, end_date, url, url2, headers,
                        excel_path="最新支持率.xlsx", url3=None):
    """Crawl match and support-rate data day by day and save it to Excel.

    For every day from ``start_date`` to ``end_date`` (inclusive) the match
    list is fetched and parsed, support rates are fetched for the parsed
    match ids, and the rows (ordered by numeric match id) are collected.
    All collected rows are written once at the end via ``save_to_excel``.

    Args:
        start_date: First day to crawl (``datetime``).
        end_date: Last day to crawl (``datetime``), inclusive.
        url: Match-list endpoint.
        url2: Support-rate endpoint.
        headers: HTTP headers forwarded to every request.
        excel_path: Output file path.
        url3: Result endpoint. When omitted, a module-level ``url3`` is used
            if one exists (the original script relied on that global),
            otherwise ``url``.

    Returns:
        ``None``.
    """
    if url3 is None:
        # Backward compatibility: the original body read a global ``url3``.
        url3 = globals().get("url3", url)

    rate_columns = ("胜支持率", "平支持率", "负支持率",
                    "让胜支持率", "让平支持率", "让负支持率")
    frames = []
    day_count = (end_date - start_date).days + 1
    for offset in range(day_count):
        single_date = start_date + timedelta(offset)
        date_str = single_date.strftime('%Y-%m-%d')
        print(f"正在爬取 {date_str} 的数据...")
        match_data = fetch_match_data(date_str, url, headers)
        print(match_data)  # raw result of the first request
        if not match_data:
            continue
        parsed_match_data = parse_match_data(match_data)
        if not parsed_match_data:
            continue
        match_ids = parsed_match_data["TimeId"]
        print(match_ids)
        support_data = fetch_support_rates(match_ids, url2, headers)
        result_data = fetch_result_rates(date_str, url3, headers)
        if not support_data:
            continue

        # Merge match data with support-rate data.
        combined_df = combine_data(parsed_match_data, support_data)

        # Row order: ascending numeric match id.
        order = sorted(range(len(match_ids)), key=lambda k: int(match_ids[k]))

        def pick(column):
            # Index each element; the original repeated the whole list here.
            return [parsed_match_data[column][i] for i in order]

        # NOTE(review): support-rate columns are used in the order
        # combine_data yields them, as the original did. This assumes the
        # support endpoint lists matches in ascending match-id order — verify.
        rates = {
            name: _fit_length(combined_df[name], len(order))
            for name in rate_columns
        }
        frames.append(pd.DataFrame({
            "Time1": pick("Time1"),
            "mathstr": pick("mathstr"),
            "TimeId": pick("TimeId"),
            "LeagueName": pick("LeagueName"),
            "Home": pick("Home"),
            "Away": pick("Away"),
            "胜支持率": rates["胜支持率"],
            "平支持率": rates["平支持率"],
            "负支持率": rates["负支持率"],
            "rang": pick("rang"),
            "让胜支持率": rates["让胜支持率"],
            "让平支持率": rates["让平支持率"],
            "让负支持率": rates["让负支持率"],
        }))
        if result_data:
            print(result_data)

    # Concatenate once instead of growing a DataFrame inside the loop.
    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    save_to_excel(all_data, excel_path)
# Example usage:
if __name__ == '__main__':
    import base64

    # Fill in the request headers before running; the post left this empty.
    headers = {
    }
    start_date = datetime(2024, 11, 28)
    end_date = datetime(2024, 11, 29)
    # The post published the endpoints as base64 text; decode them at runtime.
    url = base64.b64decode(
        "aHR0cHM6Ly93ZWJhcGkuc3BvcnR0ZXJ5LmNuL2dhdGV3YXkvamMvZm9vdGJhbGwvZ2V0TWF0Y2hSZXN1bHRWMS5xcnk="
    ).decode("utf-8")
    url2 = base64.b64decode(
        "aHR0cHM6Ly93ZWJhcGkuc3BvcnR0ZXJ5LmNuL2dhdGV3YXkvamMvY29tbW9uL2dldFN1cHBvcnRSYXRlVjEucXJ5"
    ).decode("utf-8")
    # Module-level name: fetch_football_data reads ``url3`` as a global.
    url3 = base64.b64decode(
        "aHR0cHM6Ly93ZWJhcGkuc3BvcnR0ZXJ5LmNuL2dhdGV3YXkvamMvZm9vdGJhbGwvZ2V0TWF0Y2hSZXN1bHRWMS5xcnk="
    ).decode("utf-8")
    fetch_football_data(start_date, end_date, url, url2, headers)

支持率, 数据

rt007   

代码插入方法请看置顶帖,使用md格式需要加代码标记
Yhuo   

谢谢大佬分享
ka52   

谢谢大佬分享
cyfwapj   

好东西值得点赞
chanhaofan1118   

足彩很有帮助
rt007   

好东西值得点赞
您需要登录后才可以回帖 登录 | 立即注册