python=3.13.5
scrapy=2.12.0
scrapyd=1.6.0
When you deploy a Scrapy project with scrapyd, the logs and items pages garble Chinese text. For users in China, or anyone who doesn't write their log messages in English, this makes analysing the logs difficult. Below is a fix for scrapyd's Chinese mojibake.

Before writing this post I searched the web for every published fix for scrapyd's Chinese mojibake: changing settings, adding a template that loads the log data through an iframe, patching the website.py source, and so on.
I tried many settings-based approaches and they all ended in failure. As for the template and website.py patches: scrapyd ships no such template at all, and the website.py source being passed around is from a very old release and simply cannot be used today.
The solution here also patches the source; the difference is that it targets a recent release and works with the 2025-era Scrapy and scrapyd versions listed above.
Four classes handle the Chinese encoding for logs:
LogDirectory
ProjectLogDirectory
SpiderLogDirectory
LogFile
Four more classes do the same for items:
ItemDirectory
ProjectItemDirectory
SpiderItemDirectory
ItemFile
Each file-reading class (LogFile and ItemFile) tries several encodings when reading a file, so Chinese text is decoded and displayed correctly.
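The core of the fix is the read pattern used by LogFile and ItemFile in the full source below. As a standalone sketch (the helper name read_text_any_encoding is just for illustration):

def read_text_any_encoding(path):
    """Try a few encodings in turn; fall back to UTF-8 with replacement characters."""
    for encoding in ('utf-8', 'gbk', 'gb2312', 'latin-1'):
        try:
            with open(path, 'r', encoding=encoding) as f:
                return f.read()
        except (UnicodeDecodeError, LookupError):
            continue
    # Last resort: never fail, just replace undecodable bytes.
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()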
In addition, the Root class's initializer is changed to register these custom resource classes in place of the stock File resource, so requests for logs or items go through the custom encoding logic. After this change, the Chinese content of both log files and items files displays correctly in the browser.
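To confirm the fix took effect, you can fetch a log over HTTP and check that it is served as UTF-8. A minimal sketch; the project and spider names come from the curl example further down, and the job id is a placeholder for one of your own runs:

from urllib.request import urlopen

# Placeholder URL: substitute your own project, spider and job id.
with urlopen("http://localhost:6800/logs/default/somespider/some-job-id.log") as resp:
    print(resp.headers.get("Content-Type"))   # expect: text/plain; charset=utf-8
    print(resp.read().decode("utf-8")[:500])  # Chinese log lines should display correctly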
The full source is below; if your versions match, simply replace scrapyd's website.py with it.
import os.path
import socket
from datetime import datetime, timedelta
from html import escape
from textwrap import dedent, indent
from urllib.parse import quote, urlsplit
from scrapy.utils.misc import load_object
from twisted.application.service import IServiceCollection
from twisted.python import filepath
from twisted.web import resource, static
from scrapyd.interfaces import IEggStorage, IPoller, ISpiderScheduler
from scrapyd.utils import local_items
# Use local DirectoryLister class.
class File(static.File):
    def directoryListing(self):
        path = self.path
        names = self.listNames()
        return DirectoryLister(path, names, self.contentTypes, self.contentEncodings, self.defaultType)

class DirectoryLister(static.DirectoryLister):
    template = """<html>
<head>
<title>%(header)s</title>
<style>
.even-dir { background-color: #efe0ef }
.even { background-color: #eee }
.odd-dir {background-color: #f0d0ef }
.odd { background-color: #dedede }
.icon { text-align: center }
.listing {
    margin-left: auto;
    margin-right: auto;
    width: 50%%;
    padding: 0.1em;
    }

body { border: 0; padding: 0; margin: 0; background-color: #efefef; }
h1 {padding: 0.1em; background-color: #777; color: white; border-bottom: thin white dashed;}
</style>
</head>

<body>
<h1>%(header)s</h1>

<table>
    <thead>
        <tr>
            <th>Filename</th>
            <th>Size</th>
            <th>Last modified</th>
            <th>Content type</th>
            <th>Content encoding</th>
        </tr>
    </thead>
    <tbody>
%(tableContent)s
    </tbody>
</table>
</body>
</html>
"""

    linePattern = """<tr class="%(class)s">
    <td><a href="%(href)s">%(text)s</a></td>
    <td>%(size)s</td>
    <td>%(modified)s</td>
    <td>%(type)s</td>
    <td>%(encoding)s</td>
</tr>
"""

    def _getFilesAndDirectories(self, directory):
        files = []
        dirs = []
        for path in directory:
            if isinstance(path, bytes):
                path = path.decode()  # noqa: PLW2901 from Twisted
            url = quote(path, "/")
            escaped_path = escape(path)
            child_path = filepath.FilePath(self.path).child(path)
            modified = datetime.fromtimestamp(child_path.getModificationTime()).strftime("%Y-%m-%d %H:%M")  # NEW
            if child_path.isdir():
                dirs.append(
                    {
                        "text": escaped_path + "/",
                        "href": url + "/",
                        "size": "",
                        "type": "[Directory]",
                        "encoding": "",
                        "modified": modified,  # NEW
                    }
                )
            else:
                mimetype, encoding = static.getTypeAndEncoding(
                    path, self.contentTypes, self.contentEncodings, self.defaultType
                )
                try:
                    size = child_path.getsize()
                except OSError:
                    continue
                files.append(
                    {
                        "text": escaped_path,
                        "href": url,
                        "type": f"[{mimetype}]",
                        "encoding": (encoding and f"[{encoding}]" or ""),
                        "size": static.formatFileSize(size),
                        "modified": modified,  # NEW
                    }
                )
        return dirs, files


def _get_file_url(base, directory, job, extension):
    if os.path.exists(os.path.join(directory, job.project, job.spider, f"{job.job}.{extension}")):
        return f"/{base}/{job.project}/{job.spider}/{job.job}.{extension}"
    return None

class LogFile(resource.Resource):
    """Serve a single log file, handling Chinese encodings."""

    def __init__(self, root, project, spider, job):
        resource.Resource.__init__(self)
        self.root = root
        self.project = project
        self.spider = spider
        self.job = job

    def render_GET(self, txrequest):
        log_path = os.path.join(self.root.logs_dir, self.project, self.spider, f"{self.job}.log")
        if os.path.exists(log_path):
            # Try several encodings when reading the log file.
            encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']
            content = None
            for encoding in encodings:
                try:
                    with open(log_path, 'r', encoding=encoding) as f:
                        content = f.read()
                    break
                except (UnicodeDecodeError, LookupError):
                    continue
            # If every encoding fails, fall back to replacement characters.
            if content is None:
                with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
            # Set the correct Content-Type.
            txrequest.responseHeaders.setRawHeaders(
                b'content-type',
                [b'text/plain; charset=utf-8']
            )
            return content.encode('utf-8')
        else:
            return f"Log not found: {log_path}".encode('utf-8')


class ItemFile(resource.Resource):
    """Serve a single items file, handling Chinese encodings."""

    def __init__(self, root, project, spider, job):
        resource.Resource.__init__(self)
        self.root = root
        self.project = project
        self.spider = spider
        self.job = job

    def render_GET(self, txrequest):
        item_path = os.path.join(self.root.items_dir, self.project, self.spider, f"{self.job}.jl")
        if os.path.exists(item_path):
            # Try several encodings when reading the items file.
            encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']
            content = None
            for encoding in encodings:
                try:
                    with open(item_path, 'r', encoding=encoding) as f:
                        content = f.read()
                    break
                except (UnicodeDecodeError, LookupError):
                    continue
            # If every encoding fails, fall back to replacement characters.
            if content is None:
                with open(item_path, 'r', encoding='utf-8', errors='replace') as f:
                    content = f.read()
            # Set the correct Content-Type.
            txrequest.responseHeaders.setRawHeaders(
                b'content-type',
                [b'text/plain; charset=utf-8']
            )
            return content.encode('utf-8')
        else:
            return f"Item file not found: {item_path}".encode('utf-8')

class LogDirectory(resource.Resource):
    """Serve the logs directory tree."""

    def __init__(self, root):
        resource.Resource.__init__(self)
        self.root = root

    def getChild(self, path, request):
        # Convert the path segment to a string.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # An empty segment means this directory's own listing.
        if path == "":
            return self
        # Check whether the segment names a project directory.
        project_path = os.path.join(self.root.logs_dir, path)
        if os.path.isdir(project_path):
            return ProjectLogDirectory(self.root, path)
        return resource.NoResource()

    def render_GET(self, txrequest):
        # List the project directories under logs_dir.
        if not os.path.exists(self.root.logs_dir):
            return "Logs directory does not exist".encode('utf-8')
        projects = []
        for item in os.listdir(self.root.logs_dir):
            item_path = os.path.join(self.root.logs_dir, item)
            if os.path.isdir(item_path):
                projects.append(f'<li><a href="{item}/">{item}</a></li>')
        if projects:
            content = f"<h1>Projects</h1><ul>{''.join(projects)}</ul>"
        else:
            content = "No projects found"
        txrequest.responseHeaders.setRawHeaders(
            b'content-type',
            [b'text/html; charset=utf-8']
        )
        return content.encode('utf-8')


class ProjectLogDirectory(resource.Resource):
    """Serve the log directory of one project."""

    def __init__(self, root, project):
        resource.Resource.__init__(self)
        self.root = root
        self.project = project

    def getChild(self, path, request):
        # Convert the path segment to a string.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # An empty segment means the spider listing.
        if path == "":
            return self
        # Check whether the segment names a spider directory.
        spider_path = os.path.join(self.root.logs_dir, self.project, path)
        if os.path.isdir(spider_path):
            return SpiderLogDirectory(self.root, self.project, path)
        return resource.NoResource()

    def render_GET(self, txrequest):
        # List the spiders under this project.
        project_path = os.path.join(self.root.logs_dir, self.project)
        if not os.path.exists(project_path):
            return f"Project {self.project} does not exist".encode('utf-8')
        spiders = []
        for item in os.listdir(project_path):
            item_path = os.path.join(project_path, item)
            if os.path.isdir(item_path):
                spiders.append(f'<li><a href="{item}/">{item}</a></li>')
        if spiders:
            content = f"<h1>Spiders in {self.project}</h1><ul>{''.join(spiders)}</ul>"
        else:
            content = f"No spiders found in {self.project}"
        txrequest.responseHeaders.setRawHeaders(
            b'content-type',
            [b'text/html; charset=utf-8']
        )
        return content.encode('utf-8')


class SpiderLogDirectory(resource.Resource):
    """Serve the log files of one spider."""

    def __init__(self, root, project, spider):
        resource.Resource.__init__(self)
        self.root = root
        self.project = project
        self.spider = spider

    def getChild(self, path, request):
        # Convert the path segment to a string.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # An empty segment means the log file listing.
        if path == "":
            return self
        # Check whether the segment names a log file.
        if path.endswith('.log'):
            job = path[:-4]  # strip the .log extension
            log_path = os.path.join(self.root.logs_dir, self.project, self.spider, path)
            if os.path.isfile(log_path):
                return LogFile(self.root, self.project, self.spider, job)
        return resource.NoResource()

    def render_GET(self, txrequest):
        # List the log files under this spider.
        spider_path = os.path.join(self.root.logs_dir, self.project, self.spider)
        if not os.path.exists(spider_path):
            return f"Spider {self.spider} does not exist in project {self.project}".encode('utf-8')
        logs = []
        for item in os.listdir(spider_path):
            item_path = os.path.join(spider_path, item)
            if os.path.isfile(item_path) and item.endswith('.log'):
                logs.append(f'<li><a href="{item}">{item}</a></li>')
        if logs:
            content = f"<h1>Logs for {self.project}/{self.spider}</h1><ul>{''.join(logs)}</ul>"
        else:
            content = f"No logs found for {self.project}/{self.spider}"
        txrequest.responseHeaders.setRawHeaders(
            b'content-type',
            [b'text/html; charset=utf-8']
        )
        return content.encode('utf-8')

class ItemDirectory(resource.Resource):
    """Serve the items directory tree."""

    def __init__(self, root):
        resource.Resource.__init__(self)
        self.root = root

    def getChild(self, path, request):
        # Convert the path segment to a string.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # An empty segment means this directory's own listing.
        if path == "":
            return self
        # Check whether the segment names a project directory.
        project_path = os.path.join(self.root.items_dir, path)
        if os.path.isdir(project_path):
            return ProjectItemDirectory(self.root, path)
        return resource.NoResource()

    def render_GET(self, txrequest):
        # List the project directories under items_dir.
        if not os.path.exists(self.root.items_dir):
            return "Items directory does not exist".encode('utf-8')
        projects = []
        for item in os.listdir(self.root.items_dir):
            item_path = os.path.join(self.root.items_dir, item)
            if os.path.isdir(item_path):
                projects.append(f'<li><a href="{item}/">{item}</a></li>')
        if projects:
            content = f"<h1>Projects</h1><ul>{''.join(projects)}</ul>"
        else:
            content = "No projects found"
        txrequest.responseHeaders.setRawHeaders(
            b'content-type',
            [b'text/html; charset=utf-8']
        )
        return content.encode('utf-8')


class ProjectItemDirectory(resource.Resource):
    """Serve the items directory of one project."""

    def __init__(self, root, project):
        resource.Resource.__init__(self)
        self.root = root
        self.project = project

    def getChild(self, path, request):
        # Convert the path segment to a string.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # An empty segment means the spider listing.
        if path == "":
            return self
        # Check whether the segment names a spider directory.
        spider_path = os.path.join(self.root.items_dir, self.project, path)
        if os.path.isdir(spider_path):
            return SpiderItemDirectory(self.root, self.project, path)
        return resource.NoResource()

    def render_GET(self, txrequest):
        # List the spiders under this project.
        project_path = os.path.join(self.root.items_dir, self.project)
        if not os.path.exists(project_path):
            return f"Project {self.project} does not exist".encode('utf-8')
        spiders = []
        for item in os.listdir(project_path):
            item_path = os.path.join(project_path, item)
            if os.path.isdir(item_path):
                spiders.append(f'<li><a href="{item}/">{item}</a></li>')
        if spiders:
            content = f"<h1>Spiders in {self.project}</h1><ul>{''.join(spiders)}</ul>"
        else:
            content = f"No spiders found in {self.project}"
        txrequest.responseHeaders.setRawHeaders(
            b'content-type',
            [b'text/html; charset=utf-8']
        )
        return content.encode('utf-8')


class SpiderItemDirectory(resource.Resource):
    """Serve the items files of one spider."""

    def __init__(self, root, project, spider):
        resource.Resource.__init__(self)
        self.root = root
        self.project = project
        self.spider = spider

    def getChild(self, path, request):
        # Convert the path segment to a string.
        if isinstance(path, bytes):
            path = path.decode('utf-8')
        # An empty segment means the items file listing.
        if path == "":
            return self
        # Check whether the segment names an items file.
        if path.endswith('.jl'):
            job = path[:-3]  # strip the .jl extension
            item_path = os.path.join(self.root.items_dir, self.project, self.spider, path)
            if os.path.isfile(item_path):
                return ItemFile(self.root, self.project, self.spider, job)
        return resource.NoResource()

    def render_GET(self, txrequest):
        # List the items files under this spider.
        spider_path = os.path.join(self.root.items_dir, self.project, self.spider)
        if not os.path.exists(spider_path):
            return f"Spider {self.spider} does not exist in project {self.project}".encode('utf-8')
        items = []
        for item in os.listdir(spider_path):
            item_path = os.path.join(spider_path, item)
            if os.path.isfile(item_path) and item.endswith('.jl'):
                items.append(f'<li><a href="{item}">{item}</a></li>')
        if items:
            content = f"<h1>Items for {self.project}/{self.spider}</h1><ul>{''.join(items)}</ul>"
        else:
            content = f"No items found for {self.project}/{self.spider}"
        txrequest.responseHeaders.setRawHeaders(
            b'content-type',
            [b'text/html; charset=utf-8']
        )
        return content.encode('utf-8')

class Log(resource.Resource):
    """Serve log files with Chinese-friendly encoding handling (path-based variant)."""

    def __init__(self, root):
        resource.Resource.__init__(self)
        self.root = root

    def render_GET(self, txrequest):
        # Get the URL path.
        path = txrequest.path.decode('utf-8')
        # The path has the form /logs/project/spider/job.log.
        parts = path.split('/')
        if len(parts) >= 5 and parts[0] == '' and parts[1] == 'logs':
            project = parts[2]
            spider = parts[3]
            job_with_extension = parts[4]
            # Strip the .log extension.
            if job_with_extension.endswith('.log'):
                job = job_with_extension[:-4]
            else:
                job = job_with_extension
            log_path = os.path.join(self.root.logs_dir, project, spider, f"{job}.log")
            if os.path.exists(log_path):
                # Try several encodings when reading the log file.
                encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']
                content = None
                for encoding in encodings:
                    try:
                        with open(log_path, 'r', encoding=encoding) as f:
                            content = f.read()
                        break
                    except (UnicodeDecodeError, LookupError):
                        continue
                # If every encoding fails, fall back to replacement characters.
                if content is None:
                    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
                        content = f.read()
                # Set the correct Content-Type.
                txrequest.responseHeaders.setRawHeaders(
                    b'content-type',
                    [b'text/plain; charset=utf-8']
                )
                return content.encode('utf-8')
        return "Log not found".encode('utf-8')

class Root(resource.Resource):
    def __init__(self, config, app):
        super().__init__()
        self.app = app
        self.logs_dir = config.get("logs_dir", "logs")
        self.items_dir = config.get("items_dir", "")
        self.debug = config.getboolean("debug", False)
        self.runner = config.get("runner", "scrapyd.runner")
        self.prefix_header = config.get("prefix_header", "x-forwarded-prefix")
        self.local_items = local_items(self.items_dir, urlsplit(self.items_dir))
        self.node_name = config.get("node_name", socket.gethostname())

        # Serve logs through the custom directory resource.
        if self.logs_dir:
            self.putChild(b"logs", LogDirectory(self))
        # Serve items through the custom directory resource.
        if self.local_items:
            self.putChild(b"items", ItemDirectory(self))
        for service_name, service_path in config.items("services", default=[]):
            if service_path:
                service_cls = load_object(service_path)
                self.putChild(service_name.encode(), service_cls(self))
        # Add web UI last, since its behavior can depend on others' presence.
        self.putChild(b"", Home(self))
        self.putChild(b"jobs", Jobs(self))

    def update_projects(self):
        self.poller.update_projects()
        self.scheduler.update_projects()

    def get_log_url(self, job):
        return _get_file_url("logs", self.logs_dir, job, "log")

    def get_item_url(self, job):
        if self.local_items:
            return _get_file_url("items", self.items_dir, job, "jl")
        return None

    @property
    def launcher(self):
        return IServiceCollection(self.app, self.app).getServiceNamed("launcher")

    @property
    def scheduler(self):
        return self.app.getComponent(ISpiderScheduler)

    @property
    def eggstorage(self):
        return self.app.getComponent(IEggStorage)

    @property
    def poller(self):
        return self.app.getComponent(IPoller)

class PrefixHeaderMixin:
    def get_base_path(self, txrequest):
        return txrequest.getHeader(self.root.prefix_header) or ""


class Home(PrefixHeaderMixin, resource.Resource):
    def __init__(self, root):
        super().__init__()
        self.root = root

    def prepare_projects(self):
        if projects := self.root.scheduler.list_projects():
            lis = "\n".join(f"<li>{escape(project)}</li>" for project in sorted(projects))
            return f"<p>Scrapy projects:</p>\n<ul>\n{indent(lis, '  ')}\n</ul>"
        return "<p>No Scrapy projects yet.</p>"
    def render_GET(self, txrequest):
        base_path = self.get_base_path(txrequest)
        content = dedent(
            f"""\
            <html>
            <head>
            <title>Scrapyd</title>
            <style type="text/css">
            body {{ font-family: system-ui, sans-serif; }}
            </style>
            </head>
            <body>
            <h1>Scrapyd</h1>

            {f'<p><a href="{base_path}/jobs">Jobs</a>, <a href="{base_path}/items/">Items</a>, <a href="{base_path}/logs/">Logs</a></p>' if self.root.local_items else f'<p><a href="{base_path}/jobs">Jobs</a>, <a href="{base_path}/logs/">Logs</a></p>'}

            {indent(self.prepare_projects(), "            ")}

            <p>This web UI is for monitoring only. To upload projects and schedule crawls, use the API.</p>

            <p>For example, using curl:</p>

            <p><code>curl http://localhost:6800/schedule.json -d project=default -d spider=somespider</code></p>

            <p>See the <a href="https://scrapyd.readthedocs.io/">Scrapyd documentation</a> for details.</p>
            </body>
            </html>
            """
        )
        content = content.encode()
        txrequest.setHeader("Content-Type", "text/html; charset=utf-8")
        txrequest.setHeader("Content-Length", str(len(content)))
        return content

def no_microseconds(timelike):
    # microsecond for datetime, microseconds for timedelta.
    ms = timelike.microsecond if hasattr(timelike, "microsecond") else timelike.microseconds
    return timelike - timedelta(microseconds=ms)


class Jobs(PrefixHeaderMixin, resource.Resource):
    def __init__(self, root):
        super().__init__()
        self.root = root

        self.headers = [
            "Project",
            "Spider",
            "Job",
            "PID",
            "Start",
            "Runtime",
            "Finish",
            "Log",
        ]
        # Hide the Items column if items_dir isn't local.
        if self.root.local_items:
            self.headers.append("Items")
        # Hide the Cancel column if no cancel.json webservice.
        if b"cancel.json" in self.root.children:
            self.headers.append("Cancel")
    def cancel_button(self, project, job):
        return dedent(
            f"""
            <form method="post" action="{self.base_path}/cancel.json">
                <input type="hidden" name="project" value="{escape(project)}">
                <input type="hidden" name="job" value="{escape(job)}">
                <button type="submit">Cancel</button>
            </form>
            """
        )

    def html_log_url(self, job):
        if url := self.root.get_log_url(job):
            return f'<a href="{self.base_path}{url}">Log</a>'
        return None

    def html_item_url(self, job):
        if url := self.root.get_item_url(job):
            return f'<a href="{self.base_path}{url}">Items</a>'
        return None

    def prepare_headers(self):
        ths = "\n".join(f"<th>{header}</th>" for header in self.headers)
        return f"<tr>\n{indent(ths, '  ')}\n</tr>"

    def prepare_row(self, row):
        tds = "\n".join(f"<td>{'' if row.get(header) is None else row[header]}</td>" for header in self.headers)
        return f"<tr>\n{indent(tds, '  ')}\n</tr>"
    def prepare_pending(self):
        return "\n".join(
            self.prepare_row(
                {
                    "Project": escape(project),
                    "Spider": escape(message["name"]),
                    "Job": escape(message["_job"]),
                    "Cancel": self.cancel_button(project, message["_job"]),
                }
            )
            for project, queue in self.root.poller.queues.items()
            for message in queue.list()
        )

    def prepare_running(self):
        return "\n".join(
            self.prepare_row(
                {
                    "Project": escape(process.project),
                    "Spider": escape(process.spider),
                    "Job": escape(process.job),
                    "PID": process.pid,
                    "Start": no_microseconds(process.start_time),
                    "Runtime": no_microseconds(datetime.now() - process.start_time),
                    "Log": self.html_log_url(process),
                    "Items": self.html_item_url(process),
                    "Cancel": self.cancel_button(process.project, process.job),
                }
            )
            for process in self.root.launcher.processes.values()
        )

    def prepare_finished(self):
        return "\n".join(
            self.prepare_row(
                {
                    "Project": escape(job.project),
                    "Spider": escape(job.spider),
                    "Job": escape(job.job),
                    "Start": no_microseconds(job.start_time),
                    "Runtime": no_microseconds(job.end_time - job.start_time),
                    "Finish": no_microseconds(job.end_time),
                    "Log": self.html_log_url(job),
                    "Items": self.html_item_url(job),
                }
            )
            for job in self.root.launcher.finished
        )
    def render_GET(self, txrequest):
        self.base_path = self.get_base_path(txrequest)
        content = dedent(
            f"""\
            <html>
            <head>
            <title>Scrapyd</title>
            <style type="text/css">
            body {{ font-family: system-ui, sans-serif; }}
            table {{ border-collapse: collapse; }}
            th, td {{ border-style: solid; border-width: 1px; }}
            tbody > tr:first-child {{ background-color: #eee; }}
            th, td {{ padding: .5rem; }}
            td:nth-child(2), td:nth-child(3) {{ word-break: break-all; }}
            </style>
            </head>
            <body>
            <h1>Jobs</h1>
            <p><a href="{self.base_path}/">Go up</a></p>
            <table>
            <thead>
            {indent(self.prepare_headers(), "            ")}
            </thead>
            <tbody>
            <tr><th colspan="{len(self.headers)}">Pending</th></tr>
            {indent(self.prepare_pending(), "            ")}
            </tbody>
            <tbody>
            <tr><th colspan="{len(self.headers)}">Running</th></tr>
            {indent(self.prepare_running(), "            ")}
            </tbody>
            <tbody>
            <tr><th colspan="{len(self.headers)}">Finished</th></tr>
            {indent(self.prepare_finished(), "            ")}
            </tbody>
            </table>
            </body>
            </html>
            """
        ).encode()
        txrequest.setHeader("Content-Type", "text/html; charset=utf-8")
        txrequest.setHeader("Content-Length", str(len(content)))
        return content
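If you are not sure where the installed website.py lives, one way to print its path (run inside the environment where scrapyd is installed) and then overwrite that file with the source above:

# Print the location of the installed scrapyd/website.py.
import scrapyd.website
print(scrapyd.website.__file__)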
