A script I wrote while putting together a market analysis report: pass in a keyword, adjust the time range of the posts to crawl, and it is ready to run.
Usage notes
You need to grab the cookies of your own (logged-in) Weibo account and paste them into the cookie placeholder in the code below.
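For example (a sketch only; the field names are placeholders), copy the full Cookie header from a logged-in request to s.weibo.com in your browser's developer tools and assign it to the headers dict used in the code:

headers['cookie'] = 'SUB=xxx; SUBP=xxx; ...'  # placeholder, replace with your own account's cookie string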
Code
import datetime
import json
import random
import re
import time
import traceback
import pymysql
import requests
from lxml import etree
import urllib3
import openpyxl
urllib3.disable_warnings()
# Pool of User-Agent strings; one is picked at random for the request headers
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
# 'Cache-Control': 'no-cache',
# 'Connection': 'keep-alive',
# 'referer': 'https://www.google.com/',
# 'Upgrade-Insecure-Requests': '1',
'User-Agent': random.choice(user_agents)
}
# Fetch a URL and return the response, retrying until it succeeds (returns '' on a 404)
def get_html(url):
num = 0
while True:
num += 1
try:
print("当前请求url:", url)
time.sleep(2)
headers[
'cookie'] = '这里需要你的账号的cookies'
response = requests.get(url, headers=headers, timeout=10, verify=False, proxies='')
if response.status_code == 200:
return response
elif response.status_code == 404:
return ''
else:
                print('Unexpected response status {} for url {}, retrying'.format(response.status_code, url))
        except Exception:
            print("Request failed, waiting before retrying (e.g. for the proxy to refresh)")
            time.sleep(10)
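# Note: get_html keeps retrying until it receives a 200 response (or gives up with '' on a 404);
# the num counter only counts attempts and is never used as a retry limit.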
# Detect the page encoding and decode HTML bytes to str (requires the cchardet package)
def decodeContent(html):
import cchardet as chardet
gbk_list = ["gb2312", "GB2312", "GBK", "GB18030"]
if isinstance(html, bytes):
char = chardet.detect(html)
confidence = char['confidence']
if "encoding" in char and confidence > 0.7:
items = [char["encoding"]]
else:
items = re.compile(r'charset=([^\'\"]*?)[\'\"/\s]*?>').findall(str(html))
if not items:
items = re.compile(r'charset=[\'\"](.*?)[\'\"]').findall(str(html))
if not items:
items = re.compile(r'charset=(.*?)[\'\"]').findall(str(html))
if items:
charset = 'gbk' if items[0] in gbk_list else items[0]
try:
res = html.decode(charset)
except Exception as e:
if charset == 'gbk':
try:
res = html.decode('gbk', 'ignore')
except Exception as e:
res = ""
else:
try:
res = html.decode('utf-8', 'ignore')
except Exception as e:
res = ""
else:
try:
res = html.decode('utf-8')
except Exception as e:
try:
res = html.decode('gbk')
except Exception as e:
try:
res = html.decode('utf-8', 'ignore')
except Exception as e:
res = ""
return res
return html
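# decodeContent is a standalone helper and is not called by the functions below;
# it could be applied to response.content if response.text ever mis-detects the page charset.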
# Extract post content from the search results and append it to the workbook
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'
ws.append(["content"])
def comment_info(res,keyword):
try:
contents_lis = res.xpath(
'//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="content"]')
        digg = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="card-act"]')  # interaction counts (collected but not used below)
        user_lis = res.xpath('//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="avator"]')  # author avatars (collected but not used below)
print(len(contents_lis))
for index, i in enumerate(contents_lis):
try:
content = ''.join(i.xpath('p[@node-type="feed_list_content"]//text()')).replace("\n",'').strip()
print("@@@@@@@@@@@@@@", content)
result_list = [content]
ws.append((result_list))
wb.save('weibo_info.xlsx')
except:
traceback.print_exc()
except:
pass
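# Each post's text is appended as its own row and the workbook is saved after every row;
# slow, but partial results survive an interruption.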
# Build a one-day time window (end date, start date) for day i after start_time
def time_end_start(i, start_time):
    start = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    day_end = start + datetime.timedelta(days=i)
    day_start = day_end - datetime.timedelta(days=1)
    return day_end, day_start
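# Example: time_end_start(1, "2021-01-01") -> (2021-01-02, 2021-01-01),
# i.e. the one-day window used for the search URL's timescope parameter.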
# Main crawl loop: iterate day by day over the date range for the given keyword
def run(lkll):
    # keyword(s) to search for
    lis = [lkll]
    # crawl window: start and end date
    start_time = "2021-01-01"
    end_time = "2022-01-01"
d1 = datetime.datetime.strptime(start_time, '%Y-%m-%d')
d2 = datetime.datetime.strptime(end_time, '%Y-%m-%d')
delta = d2 - d1
ccc = delta.days
print(ccc)
for i in range(0, int(ccc) + 1):
tim, threeDayAgosss = time_end_start(i, start_time)
tim = str(tim).replace("00:00:00", "").replace(" ", "")
threeDayAgosss = str(threeDayAgosss).replace("00:00:00", "").replace(" ", "")
print(tim)
if tim:
for j in lis:
print(tim, threeDayAgosss,j)
get_page(tim, threeDayAgosss, j)
else:
time.sleep(60)
# Build the search URL for one day/keyword and crawl every result page
def get_page(tim, threeDayAgosss, j):
page = 1
while True:
try:
print("________________当前第{}页_______________".format(page))
            url = 'https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}:{}&Refer=g&page={}'.format(j,
                threeDayAgosss + '-0',
                tim + '-0',
                page)
print("############", url)
            res = get_html(url)
            if not res:
                break  # 404 or no usable response, move on to the next day
            res = etree.HTML(res.text)
            comment_info(res, j)
pagss = ''.join(res.xpath("//div[@class='m-page']/div/span/ul/li[last()]//text()"))
print("!!!!!!!", pagss)
pages = pagss.replace("第", '').replace("页", '')
print(pages)
            if pages:
                # stop once the last page reported by the pager has been crawled
                if page >= int(pages):
                    break
                page += 1
            else:
                # no pager element means there is only one page of results
                break
        except Exception:
            traceback.print_exc()
            break
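The listing above defines run() but never calls it. A minimal entry point (just a sketch; reading the keyword from input() is my own assumption) could be:

if __name__ == '__main__':
    keyword = input("Keyword to search for: ")  # e.g. a brand or product name
    run(keyword)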
Usage screenshot