python微博爬虫

作者：han5562877 发布时间：2023-1-24 12:51:37

微博博文爬取
这是之前写市场分析报告时写的脚本：输入关键词、调整要爬取的博文时间范围后即可运行。
使用说明
需要先获取自己微博账号的 cookies，并填入代码 get_html 函数中标注“这里需要你的账号的cookies”的位置。
代码
import datetime
import json
import random
import re
import time
import traceback
import pymysql
import requests
from lxml import etree
import urllib3
import openpyxl
urllib3.disable_warnings()
import random
# Pool of browser User-Agent strings used to build the default headers.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2 ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
]

# Default request headers. The User-Agent is drawn from the pool once, when
# this dict is built, and the same value is reused for as long as the module
# stays loaded.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # Optional headers, currently disabled:
    # 'Cache-Control': 'no-cache',
    # 'Connection': 'keep-alive',
    # 'referer': 'https://www.google.com/',
    # 'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(user_agents),
}
# Fetch a URL, retrying until the server gives a definitive answer
def get_html(url, max_retries=None, cookie='这里需要你的账号的cookies'):
    """Download ``url`` using the shared ``headers`` plus the account cookie.

    Each attempt waits 2 seconds before sending the request. If the request
    raises, the traceback is printed and the function waits another 10
    seconds before trying again. Any status other than 200 or 404 is logged
    and retried.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts. ``None`` (the default)
            retries without limit.
        cookie: Value stored under the ``cookie`` key of the shared
            ``headers`` dict. Replace the placeholder default with the
            cookie string of your own account.

    Returns:
        The ``requests`` response object when the status code is 200, or the
        empty string ``''`` when the status code is 404 or ``max_retries``
        attempts have been used up.
    """
    # The cookie does not change between attempts, so set it once.
    headers['cookie'] = cookie
    num = 0
    while max_retries is None or num < max_retries:
        num += 1
        print("当前请求url:", url)
        time.sleep(2)
        try:
            # verify=False skips TLS certificate verification; proxies=None
            # means no explicit proxy mapping is supplied.
            response = requests.get(url, headers=headers, timeout=10, verify=False, proxies=None)
        except Exception:
            # Best-effort crawler: report the failure, back off, try again.
            print("等待代理更新")
            traceback.print_exc()
            time.sleep(10)
            continue
        if response.status_code == 200:
            return response
        if response.status_code == 404:
            return ''
        print('请求响应吗错误: {}  请求url{}  重新请求'.format(response.status_code, url))
    return ''
# Charset-aware decoding of downloaded page content
def decodeContent(html):
    """Decode raw page bytes into text, guessing the character set.

    The charset is taken from ``cchardet`` when that package is importable
    and reports an encoding with confidence above 0.7; otherwise it is
    searched for in ``charset=...`` declarations inside the payload. Any of
    the GB2312/GBK/GB18030 names is decoded as ``gbk``.

    Args:
        html: Page content. Anything that is not ``bytes`` is returned
            unchanged.

    Returns:
        The decoded ``str`` for bytes input (undecodable bytes are dropped
        as a last resort), or ``html`` itself for non-bytes input.
    """
    if not isinstance(html, bytes):
        return html

    # Imported lazily so that text input never needs the detector, and so a
    # missing package degrades to the charset-declaration search below.
    try:
        import cchardet as chardet
    except ImportError:
        chardet = None

    gbk_names = {"gb2312", "gbk", "gb18030"}
    items = []

    if chardet is not None:
        char = chardet.detect(html)
        # The detector may report None for both fields (e.g. on input it
        # cannot classify); treat that as "no confident guess".
        encoding = char.get("encoding")
        confidence = char.get("confidence") or 0
        if encoding and confidence > 0.7:
            items = [encoding]

    if not items:
        text = str(html)
        for pattern in (
            r'charset=([^\'\"]*?)[\'\"/\s]*?>',
            r'charset=[\'\"](.*?)[\'\"]',
            r'charset=(.*?)[\'\"]',
        ):
            items = re.findall(pattern, text)
            if items:
                break

    if items:
        charset = 'gbk' if items[0].strip().lower() in gbk_names else items[0]
        try:
            return html.decode(charset)
        except (LookupError, ValueError):
            # Unknown codec name or undecodable bytes: retry leniently.
            fallback = 'gbk' if charset == 'gbk' else 'utf-8'
            return html.decode(fallback, 'ignore')

    # No charset hint at all: try strict UTF-8, then strict GBK, then give up
    # on strictness and drop whatever cannot be decoded as UTF-8.
    for codec in ('utf-8', 'gbk'):
        try:
            return html.decode(codec)
        except UnicodeDecodeError:
            continue
    return html.decode('utf-8', 'ignore')
# Module-level workbook that collects the scraped post text.
# The active sheet is named "Sheet1" and starts with a single header cell.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'Sheet1'
ws.append(["content"])
def comment_info(res, keyword):
    """Append the text of every post on a search-result page to the sheet.

    Each post's text is printed, appended as a one-cell row to the
    module-level worksheet ``ws``, and the workbook ``wb`` is written to
    ``weibo_info.xlsx`` once after the page has been processed (only if at
    least one row was added). Failures are printed with a traceback instead
    of being raised, so one bad page or post does not stop the crawl.

    Args:
        res: Parsed page exposing ``.xpath()``.
        keyword: Search keyword the page belongs to. Not used by this
            function; kept so existing callers keep working.
    """
    try:
        contents_lis = res.xpath(
            '//div[@id="pl_feedlist_index"]/div[2]//div[@class="card-wrap"]//div[@class="content"]')
    except Exception:
        # e.g. ``res`` is not a parsed document; report it and skip the page.
        traceback.print_exc()
        return

    print(len(contents_lis))
    appended = False
    for node in contents_lis:
        try:
            content = ''.join(
                node.xpath('p[@node-type="feed_list_content"]//text()')
            ).replace("\n", '').strip()
            print("@@@@@@@@@@@@@@", content)
            ws.append([content])
            appended = True
        except Exception:
            traceback.print_exc()

    # Saving rewrites the whole file, so do it once per page rather than
    # once per row.
    if appended:
        try:
            wb.save('weibo_info.xlsx')
        except Exception:
            traceback.print_exc()
# Date window helper
def time_end_start(i, start_time):
    """Return the date ``i`` days after ``start_time`` and the day before it.

    Args:
        i: Day offset added to ``start_time``.
        start_time: Base date as a ``'%Y-%m-%d'`` string.

    Returns:
        A ``(day, previous_day)`` tuple of ``datetime.datetime`` objects,
        where ``previous_day`` is exactly one day earlier than ``day``.
    """
    base = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    current_day = base + datetime.timedelta(days=i)
    previous_day = current_day - datetime.timedelta(days=1)
    return current_day, previous_day
# Program entry point: crawl one keyword, one day at a time
def run(lkll, start_time="2021-01-01", end_time="2022-01-01"):
    """Crawl posts for a keyword across a date range, day by day.

    For every day from ``start_time`` through ``end_time`` (both included)
    this calls ``get_page(day, previous_day, keyword)`` with both dates
    formatted as ``YYYY-MM-DD``. Note that the first call therefore uses the
    day before ``start_time`` as its ``previous_day``.

    Args:
        lkll: Search keyword.
        start_time: First day of the range, ``'%Y-%m-%d'``.
        end_time: Last day of the range, ``'%Y-%m-%d'``.
    """
    keywords = [lkll]
    first_day = datetime.datetime.strptime(start_time, '%Y-%m-%d')
    last_day = datetime.datetime.strptime(end_time, '%Y-%m-%d')
    total_days = (last_day - first_day).days
    print(total_days)
    for i in range(total_days + 1):
        day, previous_day = time_end_start(i, start_time)
        tim = day.strftime('%Y-%m-%d')
        threeDayAgosss = previous_day.strftime('%Y-%m-%d')
        print(tim)
        for j in keywords:
            print(tim, threeDayAgosss, j)
            get_page(tim, threeDayAgosss, j)
# 通过给定信息获取Url
def get_page(tim, threeDayAgosss, j):
page = 1
while True:
      try:
         print("________________当前第{}页_______________".format(page))
         url = 'https://s.weibo.com/weibo?q={}&typeall=1&suball=1&timescope=custom:{}:{}&Refer=g&page={}'.format(j,
                                                                                                                  threeDayAgosss + '-0',
                                                                                                                  tim + '-0',
                                                                                                                  page)
         print("############", url)
         res = get_html(url)
         res = etree.HTML(res.text)
         comment_info(res,j)
         pagss = ''.join(res.xpath("//div[@class='m-page']/div/span/ul/li[last()]//text()"))
         print("！！！！！！！", pagss)
         pages = pagss.replace("第", '').replace("页", '')
         print(pages)
         if pages:
            if page
使用截图

爬虫, 博文

python微博爬虫

相关帖子

浏览过的版块

热门主题

求推荐 300 价位电视盒子

[分享] 纯前端撸了一个「交互式哺乳动物演

她说离婚原因是男的挣不到钱

这是缅北开发的游戏…

短视频里，高速现在都开始堵车了啊，大家都

百度统计或者cnzz的数据，来源分析，直接访

必应移动端有流量的吗？

发现了很多这种后缀的的百度收录，是怎么做

我看那个huoban网站权重被百度干没了。。。

按 Ctrl 好累？我不是一个人吧

热门板块

公告

网站帮助 - Yoo趣儿

我们的愿景

在 Yoo趣儿投放广告

Yoo趣儿网站用户应遵守规则

python微博爬虫

相关帖子

浏览过的版块

热门主题

求推荐 300 价位电视盒子

[分享] 纯前端撸了一个「交互式哺乳动物演

她说离婚原因是男的挣不到钱

这是缅北开发的游戏…

短视频里，高速现在都开始堵车了啊，大家都

百度统计或者cnzz的数据，来源分析，直接访

必应移动端有流量的吗？

发现了很多这种后缀的的百度收录，是怎么做

我看那个huoban网站权重被百度干没了。。。

按 Ctrl 好累？我不是一个人吧

热门板块

公告

网站帮助 - Yoo趣儿

我们的愿景

在 Yoo趣儿 投放广告

Yoo趣儿网站用户应遵守规则

在 Yoo趣儿投放广告