import logging
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# requests.get() below is called with verify=False; silence the resulting per-request TLS warning
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
column_name = 'Website'
email_column = 'Email'
visited_links = set()  # URLs already crawled, to avoid visiting them twice
visited_lock = threading.Lock()  # guards visited_links across worker threads
def extract_email_from_text(text):
    """Extract email addresses from text, undoing common anti-scraping obfuscations."""
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    # De-obfuscate "[at]"/"(at)" and "[dot]"/"(dot)", tolerating surrounding whitespace
    text = re.sub(r'\s*[\[\(]at[\]\)]\s*', '@', text, flags=re.IGNORECASE)
    text = re.sub(r'\s*[\[\(]dot[\]\)]\s*', '.', text, flags=re.IGNORECASE)
    return email_pattern.findall(text)
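# Illustrative behaviour (not executed): for the text
#   "Schreiben Sie an info [at] beispiel [dot] de oder an mail@firma.de"
# the function returns ['info@beispiel.de', 'mail@firma.de'].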
def get_all_links(soup, base_url):
    """Collect every link on the page as an absolute URL, keeping only unvisited on-site ones."""
    links = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(base_url, href)
        # The substring test keeps the crawl roughly on the start domain; see the
        # stricter same_site() sketch below for an exact host comparison.
        if base_url in full_url and full_url not in visited_links:
            links.add(full_url)
    return links
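# A stricter on-site check than the substring test above, sketched as an unused
# alternative; same_site() is an illustrative helper and is not called anywhere.
def same_site(full_url, base_url):
    """Return True when both URLs point at exactly the same host."""
    return urlparse(full_url).netloc == urlparse(base_url).netloc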
def find_impressum_link(soup, base_url):
    """Find the first link whose href points at an imprint/contact/about page."""
    links = soup.find_all('a', href=True)
    for link in links:
        href = link['href']
        # Only the href is inspected; a link labelled "Impressum" that points at
        # e.g. /imprint would be missed unless the link text were checked too.
        if any(keyword in href.lower() for keyword in ['impressum', 'kontakt', 'contact', 'about']):
            return urljoin(base_url, href)
    return None
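# Illustrative hrefs that the check above matches: "/impressum", "kontakt.html",
# "https://example.de/about-us"; matching is case-insensitive via href.lower().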
def find_email_near_keywords(soup):
    """Extract emails when the page text mentions an email-related keyword.

    Despite the name, this scans the entire page text as soon as any keyword
    occurs; it does not restrict extraction to text near the keyword.
    """
    keywords = ['email', 'e-mail', 'contact']
    text = soup.get_text().lower()
    for keyword in keywords:
        if keyword in text:
            emails = extract_email_from_text(text)
            if emails:
                return emails
    return None
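# A genuinely local search is sketched below, restricting extraction to a
# character window around each keyword occurrence. The helper name and the
# window size are illustrative assumptions, and nothing below calls it.
def emails_near_keyword(text, keyword, window=200):
    """Extract emails only from text within `window` characters of each keyword hit."""
    found = []
    start = text.find(keyword)
    while start != -1:
        found.extend(extract_email_from_text(text[max(0, start - window):start + window]))
        start = text.find(keyword, start + 1)
    return found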
def get_email_from_url(url, depth=0):
    """Extract email addresses from the given URL and the pages it links to."""
    with visited_lock:  # check-and-add atomically so two threads never crawl the same URL
        if url in visited_links or depth > 2:  # also caps the recursion depth
            return None
        visited_links.add(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10, verify=False)  # verify=False skips TLS certificate checks
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'lxml')
            # 1. Try to extract an email directly from the page text
            emails = extract_email_from_text(soup.get_text())
            if emails:
                return emails
            # 2. Look for a likely Impressum/Contact link
            impressum_link = find_impressum_link(soup, url)
            if impressum_link:
                logging.info(f"Found Impressum link: {impressum_link}")
                emails = get_email_from_url(impressum_link, depth + 1)  # recurse into the Impressum page
                if emails:
                    return emails
            # 3. Look for an email near keyword mentions in the page text
            emails = find_email_near_keywords(soup)
            if emails:
                return emails
            # 4. Walk the remaining on-site links and recurse into them
            links = get_all_links(soup, url)
            for link in links:
                emails = get_email_from_url(link, depth + 1)
                if emails:
                    return emails
    except RequestException as e:
        logging.error(f"Error while requesting {url}: {e}")
    return None
def process_url(url):
    """Process a single spreadsheet URL and try to extract an email address for it."""
    if pd.notna(url) and url.strip() != '':
        url = url.strip()
        if not url.startswith(('http://', 'https://')):  # only prepend a scheme when one is missing
            url = 'https://' + url
        logging.info(f"Visiting: {url}")
        emails = get_email_from_url(url)
        if emails:
            email = emails[0]  # take the first hit; see the pick_best_email sketch below for an alternative
            logging.info(f"Found email: {email}")
            return url, email
        else:
            logging.info("No email found")
            return url, ''
    else:
        logging.info(f"Skipping empty or invalid URL: {url}")
        return url, ''
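# The comment above notes that, when several addresses are found, the code
# could pick the most suitable instead of the first. A minimal sketch of that
# idea, assuming an address on the site's own domain is preferable;
# pick_best_email() is illustrative and not wired into process_url.
def pick_best_email(emails, site_host):
    """Prefer an address whose domain appears in the site's hostname, else the first hit."""
    for email in emails:
        domain = email.split('@', 1)[1].lower()
        if domain in site_host.lower():
            return email
    return emails[0]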
# Handle the Excel I/O with context managers
input_file = 'out3.xlsx'
output_file = 'out3_updated.xlsx'
try:
    # Read the Excel file
    df = pd.read_excel(input_file, engine='openpyxl')
    # Add the email column if it does not exist yet
    if email_column not in df.columns:
        df[email_column] = ''
    # Crawl the URLs with a thread pool; a modest worker count avoids exhausting
    # local sockets and hammering the target servers
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_to_url = {executor.submit(process_url, row[column_name]): index for index, row in df.iterrows()}
        for future in as_completed(future_to_url):
            index = future_to_url[future]
            try:
                url, email = future.result()
                df.at[index, email_column] = email
            except Exception as e:
                logging.error(f"Error while processing a URL: {e}")
    # Write the result with a context-managed Excel writer
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as excel_writer:
        df.to_excel(excel_writer, index=False)
    logging.info(f"Processing finished; results saved to {output_file}")
except Exception as e:
    logging.error(f"Error while processing the Excel file: {e}")
# Written a bit sloppily, but it works well enough.