This ran fine for me on Python 3.7. First export your bookmarks from the browser as an HTML file, then convert that to CSV; the CSV holds the titles and URLs, one pair per row, like this:

标题,链接
123,http://www.123.com

I left it running overnight and it pulled down tens of thousands of pages.
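For anyone who has not looked inside a browser bookmark export: it is an ordinary HTML file in which every bookmark is an <a> tag, which is exactly what the first script pulls out. A minimal sketch of that extraction (the sample markup below is mine, real exports carry extra attributes and nested folders):

from bs4 import BeautifulSoup

# Toy fragment in the bookmark-export style (illustrative only)
sample = '<DL><DT><A HREF="http://www.123.com" ADD_DATE="0">123</A></DL>'
for a in BeautifulSoup(sample, 'html.parser').find_all('a'):
    print(a.get_text(strip=True), a.get('href', ''))  # -> 123 http://www.123.com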
书签导csv.py (bookmarks -> CSV)
import os
import csv
from bs4 import BeautifulSoup

# Step 1: ask for the path of the exported bookmarks HTML file
html_path = input("Enter the path of the bookmarks HTML file: ")

# Make sure the file exists
if not os.path.exists(html_path):
    print(f"Error: file '{html_path}' does not exist.")
    exit()
print(f"Step 1: got HTML file path: {html_path}")

# Step 2: read the HTML file
try:
    with open(html_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    print("Step 2: read the HTML file.")
except Exception as e:
    print(f"Error: failed to read the HTML file: {e}")
    exit()

# Step 3: parse the HTML and extract the links
soup = BeautifulSoup(html_content, 'html.parser')
links = soup.find_all('a')
if not links:
    print("Warning: no links found in the HTML file.")
    exit()

bookmark_data = []
for link in links:
    name = link.get_text(strip=True)
    url = link.get('href', '')
    if name and url:
        bookmark_data.append([name, url])
print(f"Step 3: extracted {len(bookmark_data)} bookmark links from the HTML file.")

# Step 4: build the CSV path next to the HTML file
html_dir = os.path.dirname(html_path)
html_filename = os.path.basename(html_path)
csv_filename = os.path.splitext(html_filename)[0] + '.csv'
csv_path = os.path.join(html_dir, csv_filename)

# Step 5: write the CSV file
try:
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['网站', '链接'])  # header row: site name, URL
        writer.writerows(bookmark_data)   # data rows
    print(f"Step 5: saved the bookmark data to CSV: {csv_path}")
    print(f"The CSV file contains {len(bookmark_data)} records.")
except Exception as e:
    print(f"Error: failed to write the CSV file: {e}")
    exit()
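Before kicking off the overnight crawl it is worth spot-checking the CSV the script just wrote. A quick read-back (my own snippet, not part of the original post; the file name favorites.csv is illustrative, and the column names are the '网站'/'链接' header written above):

import csv

with open('favorites.csv', newline='', encoding='utf-8-sig') as f:  # illustrative path
    for row in csv.DictReader(f):
        print(row['网站'], row['链接'])  # title and URL columns written by 书签导csv.py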
链接导网页.py (links -> saved web pages)
import csv
import os
import random
import requests
import concurrent.futures
from urllib.parse import urlparse
import time
from datetime import datetime

# Configuration
MAX_WORKERS = 5   # maximum number of worker threads
TIMEOUT = 10      # request timeout in seconds
RETRIES = 3       # number of retries per URL
DELAY = 1         # delay between requests in seconds (used when retrying)

# A pool of common User-Agent strings to mimic different browsers
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36 Edg/91.0.864.71',
]

def get_random_user_agent():
    """Return a random User-Agent string."""
    return random.choice(USER_AGENTS)

def create_directory(directory):
    """Create a directory if it does not already exist."""
    if not os.path.exists(directory):
        try:
            # exist_ok avoids a race when several worker threads hit the same domain
            os.makedirs(directory, exist_ok=True)
            print(f"Created directory: {directory}")
        except OSError as e:
            print(f"Could not create directory {directory}: {e}")
            return False
    return True

def sanitize_filename(filename):
    """Strip characters that are not allowed in file names."""
    invalid_chars = '<>:"/\\|?*'  # characters not allowed in Windows file names
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    return filename

def process_url(row, url_column, error_file):
    """Download a single URL and save the page under a folder named after its domain."""
    url = row[url_column]
    print(f"Processing URL: {url}")
    # Split the URL into domain and path
    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        path = parsed_url.path
        # Empty path or a bare "/" gets the default file name
        if not path or path == '/':
            filename = 'index.html'
        else:
            # Use the last path segment as the file name
            parts = path.strip('/').split('/')
            if '.' in parts[-1] and len(parts[-1].split('.')) > 1:
                # The last segment already carries an extension, keep it
                filename = parts[-1]
            else:
                # Otherwise append .html
                filename = f"{parts[-1]}.html" if parts[-1] else "index.html"
        # Clean the file name
        filename = sanitize_filename(filename)
        # Create a folder named after the domain
        domain_dir = os.path.join(os.getcwd(), domain)
        if not create_directory(domain_dir):
            raise Exception(f"Could not create directory: {domain_dir}")
        # Build the save path
        save_path = os.path.join(domain_dir, filename)
        # Try to fetch the URL, with retries
        for attempt in range(RETRIES):
            try:
                headers = {
                    'User-Agent': get_random_user_agent(),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                }
                response = requests.get(url, headers=headers, timeout=TIMEOUT)
                response.raise_for_status()  # raise on bad HTTP status codes
                # Detect the encoding and grab the page text
                response.encoding = response.apparent_encoding
                content = response.text
                # Save the page
                with open(save_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f"Saved: {save_path} (encoding: {response.encoding})")
                return True
            except requests.exceptions.RequestException as e:
                print(f"Attempt {attempt + 1}/{RETRIES} for {url} failed: {e}")
                if attempt < RETRIES - 1:
                    time.sleep(DELAY)  # wait a moment before retrying
        # The original post is truncated here; the rest of this function is a
        # reconstruction of the obvious intent: log the failed URL and give up on it.
        with open(error_file, 'a', encoding='utf-8') as f:
            f.write(f"{datetime.now().isoformat()},{url}\n")
        return False
    except Exception as e:
        print(f"Skipping {url}: {e}")
        return False
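The forum post breaks off in the middle of process_url, so the CSV-reading, thread-pool driver that the imports (csv, concurrent.futures, datetime) clearly point to is missing. Below is a minimal sketch of that driver under those assumptions; the error-log name errors.csv and the prompt text are mine, and the '链接' column is the one written by 书签导csv.py:

def main():
    csv_path = input("Enter the path of the bookmarks CSV file: ").strip()
    if not os.path.exists(csv_path):
        print(f"Error: file '{csv_path}' does not exist.")
        return
    error_file = 'errors.csv'  # illustrative name for the failure log
    with open(csv_path, newline='', encoding='utf-8-sig') as f:
        rows = list(csv.DictReader(f))
    print(f"Loaded {len(rows)} URLs, starting at {datetime.now():%Y-%m-%d %H:%M:%S}")
    # Fetch pages concurrently with a small thread pool
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_url, row, '链接', error_file) for row in rows]
        done = sum(1 for f in concurrent.futures.as_completed(futures) if f.result())
    print(f"Finished: {done}/{len(rows)} pages saved.")

if __name__ == "__main__":
    main()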
网页导md.py (saved pages -> Markdown)
import os
import chardet
import markdownify
from bs4 import BeautifulSoup
from urllib.parse import unquote

# Maximum file name length (adjust for your file system)
MAX_FILENAME_LENGTH = 255

def detect_encoding(file_path):
    """Guess the encoding of a file with chardet."""
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

def get_html_title(html_content, encoding):
    """Return the <title> text of the page, or None if it has none."""
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title
        return title.text.strip() if title else None
    except Exception:
        return None

def truncate_filename(filename, max_length):
    """Truncate a file name so it fits within the system limit."""
    # Part of this function is missing from the post; the body below is a
    # reconstruction that matches the two return statements that survived.
    if len(filename) <= max_length:
        return filename
    name, ext = os.path.splitext(filename)
    available_length = max_length - len('_trunc') - len(ext)
    if available_length > 0:
        return f"{name[:available_length]}_trunc{ext}"
    # If the extension alone is very long, keep only a random stub plus the extension
    return f"file_{os.urandom(4).hex()}{ext}"

def convert_html_to_md(html_path):
    try:
        # Detect the file encoding, falling back to utf-8
        encoding = detect_encoding(html_path)
        if not encoding:
            encoding = 'utf-8'
        # Read the HTML content
        with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
            html_content = f.read()
        # Use the HTML <title> as the Markdown file name
        title = get_html_title(html_content, encoding)
        # URL-decode the HTML file name
        html_filename = os.path.basename(html_path)
        decoded_html_filename = unquote(html_filename)
        html_basename = os.path.splitext(decoded_html_filename)[0]
        # Replace characters that are not allowed in file names
        invalid_chars = '<>:"/\\|?*'
        for char in invalid_chars:
            html_basename = html_basename.replace(char, '_')
        if not title:
            # No title: fall back to the decoded HTML file name
            md_filename = f"{html_basename}.md"
        else:
            # Clean the title as well
            for char in invalid_chars:
                title = title.replace(char, '_')
            # Name the output "title_original-name.md"
            md_filename = f"{title}_{html_basename}.md"
        # Truncate over-long file names
        md_filename = truncate_filename(md_filename, MAX_FILENAME_LENGTH)
        # Build the Markdown file path next to the HTML file
        md_path = os.path.join(os.path.dirname(html_path), md_filename)
        # If the file already exists, append a counter suffix
        if os.path.exists(md_path):
            counter = 1
            base, ext = os.path.splitext(md_filename)
            while os.path.exists(md_path):
                # Truncate the base name to leave room for the counter
                available_length = MAX_FILENAME_LENGTH - len(ext) - len(str(counter)) - 1
                # The lines between here and the success message are missing from the
                # post; this loop body and the conversion below are a reconstruction.
                if len(base) > available_length:
                    base = base[:available_length]
                md_filename = f"{base}_{counter}{ext}"
                md_path = os.path.join(os.path.dirname(html_path), md_filename)
                counter += 1
        # Convert to Markdown and write the result
        markdown_content = markdownify.markdownify(html_content)
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        print(f"Converted: {html_path} -> {md_path}")
        return True
    except Exception as e:
        print(f"Skipping file {html_path}: {str(e)}")
        return False

def main():
    # Ask for the folder to process
    folder_path = input("Enter the folder to process: ").strip()
    # Make sure the path exists
    if not os.path.exists(folder_path):
        print(f"Error: path '{folder_path}' does not exist")
        return
    # Walk the folder and its sub-folders and convert every HTML file
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith('.html') or file.lower().endswith('.htm'):
                html_path = os.path.join(root, file)
                convert_html_to_md(html_path)

if __name__ == "__main__":
    main()
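To make the conversion step concrete, here is a tiny standalone demo of markdownify (my own illustration, not part of the post; the HTML string is made up):

import markdownify

html = "<h1>123</h1><p>Some text with a <a href='http://www.123.com'>link</a>.</p>"
print(markdownify.markdownify(html))
# Roughly yields:
# 123
# ===
#
# Some text with a [link](http://www.123.com).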