C:\Users\ASUS>python keledge.py
2023-02-27 13:38:02,153 [INFO ] ====== WebDriver manager ======
2023-02-27 13:38:03,365 [INFO ] Get LATEST chromedriver version for google-chrome 110.0.5481
2023-02-27 13:38:04,737 [INFO ] Driver [C:\Users\ASUS\.wdm\drivers\chromedriver\win32\110.0.5481\chromedriver.exe] found in cache
C:\Users\ASUS\keledge.py:105: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
self.driver = webdriver.Chrome(
DevTools listening on ws://127.0.0.1:3623/devtools/browser/c386fccf-dda2-4384-a749-ff790a312aad
C:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\http\cookiejar.py:2083: UserWarning: http.cookiejar bug!
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\http\cookiejar.py", line 2044, in _really_load
domain, domain_specified, path, secure, expires, name, value = \
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: too many values to unpack (expected 7)
_warn_unhandled_exception()
Traceback (most recent call last):
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\http\cookiejar.py", line 2044, in _really_load
domain, domain_specified, path, secure, expires, name, value = \
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: too many values to unpack (expected 7)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\ASUS\keledge.py", line 363, in
a.main()
File "C:\Users\ASUS\keledge.py", line 237, in main
self.load_cookies(self.cookie_path)
File "C:\Users\ASUS\keledge.py", line 114, in load_cookies
cj.load(filename=path, ignore_discard=True, ignore_expires=True)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\http\cookiejar.py", line 1802, in load
self._really_load(f, filename, ignore_discard, ignore_expires)
File "C:\Users\ASUS\AppData\Local\Programs\Python\Python311\Lib\http\cookiejar.py", line 2084, in _really_load
raise LoadError("invalid Netscape format cookies file %r: %r" %
http.cookiejar.LoadError: invalid Netscape format cookies file 'C:\\Users\\ASUS\\cookies.txt': 'www.keledge.com\tFALSE\t/wrap/details\tFALSE\t1710769592\tp_h5_u\t78C12B56-3EEB-4B0F-AF06-13B5CC4F8DBA\t42'
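The last line of the LoadError shows the record that breaks the parse: the p_h5_u line in cookies.txt carries 8 tab-separated fields (note the trailing 42), while MozillaCookieJar's Netscape format expects exactly 7 (domain, flag, path, secure, expires, name, value). A minimal sketch of a clean-up pass that truncates every data line to 7 columns before loading, assuming the trailing column is junk added by the cookie exporter (file names here are only examples):

# sanitize_cookies.py -- hedged sketch, not part of keledge.py:
# rewrite cookies.txt so each data line has the 7 tab-separated
# fields that http.cookiejar.MozillaCookieJar expects
in_path = r'C:\Users\ASUS\cookies.txt'
out_path = r'C:\Users\ASUS\cookies_fixed.txt'

with open(in_path, encoding='utf-8') as fin, open(out_path, 'w', encoding='utf-8') as fout:
    for line in fin:
        stripped = line.rstrip('\n')
        # keep the header comment and blank lines untouched
        if not stripped or stripped.startswith('#'):
            fout.write(line)
            continue
        fields = stripped.split('\t')
        # keep only the first 7 columns; the failing line carried 8
        fout.write('\t'.join(fields[:7]) + '\n')

Pointing cookie_path at the sanitized file (or replacing cookies.txt with it) should let cj.load(...) get past the LoadError, provided the extra column really is spurious.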
Attached is the code that was run:
# kelege ePub to HTML
# author: Wenbin FAN
# date: Jul. 2022
# mail: [email protected]
#
import os
import re
import requests
import requests.utils
import json
import http.cookiejar as cookiejar
import sqlite3
import time
import numpy as np
from datetime import datetime
from tqdm import tqdm
import secrets # generate tokens for toc
import logging
import http.client as httplib
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys # press page down
# from selenium.webdriver.common.proxy import Proxy, ProxyType
from bs4 import BeautifulSoup as bs
from lxml import etree
from urllib.parse import urlsplit
import urllib.request
class kezhi_epub():
    def __init__(self, url,
                 root=r'C:\Users\ASUS',
                 cookie_path=r'C:\Users\ASUS\cookies.txt',
                 chrome_exe_path=r'C:\Users\ASUS\chromedriver.exe'):
        self.root = root
        self.cookie_path = cookie_path
        self.chrome_exe_path = chrome_exe_path
        self.init_logger()
        self.init_browser()
        self.url = url
        self.load_wait = 50
        self.max_Ntries = 1000
        self.image_folder_name = 'images'
        self.toc_html_name = 'TOC.html'
        self.main_html_name = 'main.html'
        self.index_html_name = 'index.html'
        self.toc_target = 'toc'
        self.main_target = 'text'
        return
    def init_logger(self):
        logFormatter = logging.Formatter("%(asctime)s [%(levelname)-5.5s] %(message)s")
        rootLogger = logging.getLogger()
        rootLogger.setLevel(logging.INFO)
        fileHandler = logging.FileHandler("{}".format(os.path.join(self.root, 'fan.log')))
        fileHandler.setFormatter(logFormatter)
        rootLogger.addHandler(fileHandler)
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logFormatter)
        rootLogger.addHandler(consoleHandler)
        return
    def init_browser(self):
        caps = DesiredCapabilities.CHROME
        caps['goog:loggingPrefs'] = {'performance': 'ALL'}
        options = Options()
        options.add_argument('--headless')
        # options.add_argument('--disable-gpu')
        options.add_argument("--window-size=1920,1080")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        options.add_argument("disable-blink-features=AutomationControlled")
        prefs = {"profile.managed_default_content_settings.images": 2}  # disable images
        options.add_experimental_option("prefs", prefs)
        # prox = Proxy()
        # prox.proxy_type = ProxyType.MANUAL
        # prox.http_proxy = "127.0.0.1:80"
        # capabilities = webdriver.DesiredCapabilities.CHROME
        # prox.add_to_capabilities(capabilities)
        s = Service(self.chrome_exe_path)
        # passing the driver path positionally is what triggers the
        # executable_path DeprecationWarning shown in the log above
        self.driver = webdriver.Chrome(
            ChromeDriverManager().install(),
            # service=s,
            desired_capabilities=caps,
            options=options)
        return
    def load_cookies(self, path):
        cj = cookiejar.MozillaCookieJar()
        cj.load(filename=path, ignore_discard=True, ignore_expires=True)
        for cookie in cj:
            cookie_dict = {'domain': cookie.domain, 'name': cookie.name, 'value': cookie.value, 'secure': cookie.secure}
            if cookie.expires:
                cookie_dict['expiry'] = cookie.expires
            if cookie.path_specified:
                cookie_dict['path'] = cookie.path
            self.driver.add_cookie(cookie_dict)
        return
    def download_image(self, url):
        big_img_url = urlsplit(url)._replace(query=None).geturl()
        big_img_url = str(big_img_url)
        img_name = big_img_url.split('/')[-1]
        # urllib.request.urlretrieve(big_img_url, os.path.join(self.img_folder, img_name))
        self.image_link_file.write(f'{big_img_url}\n dir={self.img_folder}\n out={img_name}\n')
        return img_name
    def parse_chapter(self, soup):
        # remove random characters
        for div in soup.find_all('span', {'class': 'random'}):
            div.decompose()
        # unwrap all spans
        for s in soup.find_all('span'):
            s.unwrap()
        # download all images
        for img in soup('img'):
            img_url = img['data-src']
            img_name = self.download_image(img_url)
            img['src'] = f'./{self.image_folder_name}/{img_name}'
            # delete extra attributes
            del img['data-src']
            del img['isloaded']
            del img['Kezi_Zhang']  # test an attribute that does not exist
        # images inside svg
        for img in soup('image'):
            img_url = img['xlink:href']
            img_name = self.download_image(img_url)
            img['xlink:href'] = f'./{self.image_folder_name}/{img_name}'
        # restore all hrefs
        # "chap01.html#TAGTAGTAG" -> "#TAGTAGTAG"
        for xref in soup.find_all('a', attrs={'href': True}):
            if '#' in xref['href'] and 'tp:' not in xref['href']:
                xref['href'] = '#' + xref['href'].replace('#', '').replace('.', '')
        # get headings
        for heading in soup.find_all(re.compile("^h[1-6]$")):
            level_text = heading.name
            try:
                hid = heading['id']
                hid += secrets.token_hex(8)
            except KeyError:
                logging.error(f'There is no ID for title: {level_text} - {heading.text}')
                hid = secrets.token_hex(16)  # len = 32
            del heading['id']
            # internal wrap (... strange name XD
            toc_href = f'{hid}-TOC'
            head_text = heading.text
            heading.wrap(
                bs().new_tag('a', attrs={
                    'href': f'{self.toc_html_name}#{toc_href}',
                    'target': self.toc_target,
                    'id': hid,
                }))
            heading.parent.wrap(bs().new_tag(level_text))
            heading.unwrap()
            # TOC entry (markup reconstructed from context): a link back to the heading
            # in main.html, carrying its own id so the heading's anchor can point here
            self.toc_html_file.write(
                f'<a class="{level_text}" id="{toc_href}" '
                f'href="{self.main_html_name}#{hid}" target="{self.main_target}">'
                f'{head_text}</a><br/>\n'
            )
            self.toc_html_file.flush()
        for heading in soup.find_all('div', ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            level_text = heading.get('class')[0]
            try:
                hid = heading['id']
                hid += secrets.token_hex(8)
            except KeyError:
                logging.error(f'There is no ID for title: {level_text} - {heading.text}')
                hid = secrets.token_hex(16)  # len = 32
            del heading['id']
            heading.name = level_text
            # internal wrap (... strange name XD
            toc_href = f'{hid}-TOC'
            head_text = heading.text
            heading.wrap(
                bs().new_tag('a', attrs={
                    'href': f'{self.toc_html_name}#{toc_href}',
                    'target': self.toc_target,
                    'id': hid,
                }))
            heading.parent.wrap(bs().new_tag(level_text))
            heading.unwrap()
            # TOC entry (markup reconstructed from context), same as above
            self.toc_html_file.write(
                f'<a class="{level_text}" id="{toc_href}" '
                f'href="{self.main_html_name}#{hid}" target="{self.main_target}">'
                f'{head_text}</a><br/>\n'
            )
            self.toc_html_file.flush()
        self.main_html_file.write(str(soup))
        self.main_html_file.write('\n')
        self.main_html_file.flush()
        return
    def main(self):
        self.driver.get('https://www.keledge.com/login')  # visit the host before setting cookies
        # add cookies
        self.load_cookies(self.cookie_path)
        self.driver.get(self.url)
        element = WebDriverWait(self.driver, self.load_wait).until(
            EC.presence_of_element_located((By.CLASS_NAME, "epub-main"))
        )
        time.sleep(3)  # sleep to wait for the title to load
        # get book title from html title
        title = self.driver.title
        self.title = re.sub(r'[^\w\-_\. ]', '_', title)
        # make book folder
        self.book_folder = os.path.join(self.root, self.title)
        if not os.path.exists(self.book_folder):
            os.makedirs(self.book_folder)
        self.img_folder = os.path.join(self.book_folder, self.image_folder_name)
        if not os.path.exists(self.img_folder):
            os.mkdir(self.img_folder)
        # index file (frameset markup reconstructed: TOC frame on the left, main text on the right)
        index_html_file = open(os.path.join(self.book_folder, self.index_html_name), 'w', encoding='utf-8')
        index_html_contents = f'''<html>
<head><title>{self.title}</title></head>
<frameset cols="25%,75%">
    <frame src="{self.toc_html_name}" name="{self.toc_target}">
    <frame src="{self.main_html_name}" name="{self.main_target}">
</frameset>
</html>
'''
        index_html_file.write(index_html_contents)
        index_html_file.close()
        # toc html file (head markup reconstructed)
        self.toc_html_file = open(os.path.join(self.book_folder, self.toc_html_name), 'w', encoding='utf-8')
        toc_html_contents = '''<html>
<head><meta charset="utf-8"><title>TOC</title></head>
<body>
'''
        self.toc_html_file.write(toc_html_contents)
        html_path = os.path.join(self.book_folder, self.main_html_name)
        self.main_html_file = open(html_path, 'w', encoding='utf-8')
        # write html basics: head, title, css (markup reconstructed)
        self.main_html_file.write('<html>\n<head>\n<title>Main text</title>\n')
        self.main_html_file.write('<meta charset="utf-8">\n')
        self.main_html_file.write('<style>img { max-width: 100%; }</style>\n')
        self.main_html_file.write('</head>\n')
        self.main_html_file.write('<body>\n\n')
        # # turn off the "guide" mask that covers the whole page
        # self.driver.find_element(By.XPATH, '//*[@id="epub-reader"]/div[3]/div/div/ul/li[11]/div').click()
        # a file that stores all image links; we will download all images after grabbing the main text.
        # the format is the aria2c input-file format, i.e.
        # 1 | <image url>
        # 2 |     dir=<download folder>
        # 3 |     out=<file name>
        # 4 | <next image url>
        # 5 |     dir=<download folder>
        # ...
        self.image_link_file = open(os.path.join(self.book_folder, 'image.list'), 'w', encoding='utf-8')
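        # usage note (assumes aria2 is installed; not part of the original script's flow):
        # the finished list can be fed to aria2c with its input-file option, e.g.
        #     aria2c -i image.list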
        # start once the progress indicator is present
        WebDriverWait(self.driver, self.load_wait).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="epub-reader"]/div[3]/div/div/div'))
        )
        chapter_list = []
        max_tries = 0
        last_progress = 0.0
        while max_tries < self.max_Ntries:
            # (reconstructed) parse the currently rendered page and walk the chapter <div>s;
            # the CSS selector is a guess based on the "epub-main" container waited on above
            soup = bs(self.driver.page_source, 'lxml')
            for chapter_soup in soup.select('div.epub-main > div'):
                chapter_name = chapter_soup.get('id', 'FALSE_CHAPTER_NAME')
                if chapter_name != 'FALSE_CHAPTER_NAME':
                    if chapter_name not in chapter_list:
                        chapter_list.append(chapter_name)
                        logging.info(f'Now last 3 chapters of {len(chapter_list)}: {chapter_list[max(-len(chapter_list), -3):]}')
                        self.parse_chapter(chapter_soup)
                        logging.info(f'Parse done! Chapter name: {chapter_name}')
                        time.sleep(self.load_wait * 0.0)  # sleep a little when there is a new chapter
                elif chapter_soup['class'] == ['read-end']:
                    continue
                elif chapter_soup['class'] == ['scroll-loading']:
                    continue
                else:
                    print('Wrong chapter name found! Below is the full div. ')
                    print(chapter_soup)
            # scroll by one window height
            # # self.driver.find_element(By.XPATH, '//*[@id="app"]').send_keys(Keys.END)
            # # # The line above returns: element not interactable
            self.driver.execute_script('document.getElementsByClassName("epub-single-view")[0].scrollBy(0, document.body.scrollHeight*1.2)')
            # reading progress
            prog_elem = self.driver.find_element(By.XPATH, '//*[@id="epub-reader"]/div[3]/div/div/div').text.replace('%', '')
            logging.info(f'Read progress: {prog_elem}')
            if float(prog_elem) >= 100:
                max_tries += 100  # push max_tries past the limit once 100 % is reported
            # to scroll more (reconstructed: count stalled iterations; the 1e-6 threshold is a guess)
            if float(prog_elem) - last_progress < 1e-6:
                max_tries += 1
            else:
                max_tries = 0
            last_progress = float(prog_elem)
        # (reconstructed) close the image list and finish the html files
        self.image_link_file.close()
        self.main_html_file.write('</body>\n</html>\n')
        self.main_html_file.close()
        self.toc_html_file.close()
        return
if __name__ == '__main__':
    url = 'https://www.keledge.com/epubReader?url=https%3A%2F%2Fgateway.keledge.com%2Ftransfer%2Faqr%2Fauthorize&contentexternalid=P00003-01-56743-Epub&id=976478808502702080&organizationExternalId=18-548efce3f197442eadefd9764075e7b9&objectType=104&process=&tocValue='
    cookie_path = r'C:\Users\ASUS\cookies.txt'
    chrome_driver_path = r'C:\Users\ASUS\chromedriver.exe'
    a = kezhi_epub(url, cookie_path=cookie_path, chrome_exe_path=chrome_driver_path)
    a.main()