步骤:使用 Python 的 requests 和 beautifulsoup4 库来抓取并解析 sitemap XML 文件,再用正则表达式筛选后缀名为 .html 的 URL,抓取每个页面的 title、keywords 和 description,最后保存到一个 txt 文本文件中。
import re

import requests
from bs4 import BeautifulSoup

# Sitemap to crawl, and a browser-like User-Agent so the site serves normal pages.
SITEMAP_URL = 'https://www.xxxxxxxxxxxxxxxx/sitemap.xml'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Only pages whose URL ends in .html are scraped (compiled once, not per URL).
HTML_PATTERN = re.compile(r'.*\.html$')


def fetch_sitemap_urls(sitemap_url: str, headers: dict) -> list:
    """Download the sitemap XML and return every <loc> URL it lists.

    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page.
    """
    response = requests.get(sitemap_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'xml')
    return [loc.text for loc in soup.find_all('loc')]


def extract_page_meta(html: str) -> tuple:
    """Return (title, keywords, description) extracted from an HTML document.

    Any field missing from the page comes back as '' — the original code
    raised TypeError/AttributeError on pages without a <title> or without
    the keywords/description meta tags, aborting the whole crawl.
    """
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.string if soup.title and soup.title.string else ''

    def meta_content(name: str) -> str:
        tag = soup.find('meta', attrs={'name': name})
        return tag.get('content', '') if tag else ''

    return title, meta_content('keywords'), meta_content('description')


def main() -> None:
    """Crawl the sitemap and append each .html page's metadata to output.txt."""
    urls = fetch_sitemap_urls(SITEMAP_URL, HEADERS)
    # Open the output file once, instead of reopening it for every page.
    with open('output.txt', 'a', encoding='utf-8') as f:
        for page_url in urls:  # renamed: the original shadowed the sitemap `url`
            if not HTML_PATTERN.match(page_url):
                continue
            try:
                response = requests.get(page_url, headers=HEADERS, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                # Best-effort crawl: skip unreachable pages, keep going.
                print('skip {}: {}'.format(page_url, exc))
                continue
            title, keywords, description = extract_page_meta(response.text)
            f.write('Title: {}\n'.format(title))
            f.write('Keywords: {}\n'.format(keywords))
            f.write('Description: {}\n'.format(description))
            f.write('URL: {}\n\n'.format(page_url))


if __name__ == '__main__':
    main()