Once BASE_URL is set to https://oi-wiki.org/, the script automatically starts crawling that URL and every sub-page under it, and saves pages that share the same parent path into the same folder.
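The script relies on the third-party packages requests, beautifulsoup4 and pdfkit, and pdfkit in turn shells out to a locally installed wkhtmltopdf binary. Before running it, a small pre-flight check can save a failed run; the sketch below is not part of the original script, and the path it checks is simply the one assumed in the pdfkit configuration further down:

import os
import shutil

# Same path as passed to pdfkit.configuration() below (an assumption; adjust to your install)
WKHTMLTOPDF = '/usr/local/bin/wkhtmltopdf'
if not (os.path.exists(WKHTMLTOPDF) or shutil.which('wkhtmltopdf')):
    raise SystemExit('wkhtmltopdf not found - install it or adjust the path')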
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import pdfkit
from urllib3.exceptions import InsecureRequestWarning

# Disable SSL certificate warnings (requests are made with verify=False)
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Configure the wkhtmltopdf path
config = pdfkit.configuration(wkhtmltopdf='/usr/local/bin/wkhtmltopdf')

BASE_URL = 'https://oi-wiki.org/'
DOMAIN = urlparse(BASE_URL).netloc

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}

visited = set()
queue = [BASE_URL]


def is_valid_url(url):
    # Only follow links on the same domain, without fragments, and skip binary assets
    parsed = urlparse(url)
    return (parsed.netloc == DOMAIN and
            not parsed.fragment and
            not url.endswith(('.zip', '.pdf', '.jpg', '.png')))


def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        full_url = urljoin(base_url, a['href']).split('#')[0]
        if is_valid_url(full_url) and full_url not in visited:
            links.append(full_url)
            visited.add(full_url)
    return links


def fetch_page(url):
    try:
        print(f'[*] Fetching: {url}')
        res = requests.get(url, headers=headers, verify=False, timeout=30)
        res.encoding = 'utf-8'
        return res.text
    except Exception as e:
        print(f'[!] Fetch failed: {url} - {str(e)}')
        return None


def clean_html(html, url):
    soup = BeautifulSoup(html, 'html.parser')
    # Remove the top navigation, sidebar and other non-content elements
    for tag in soup.select('.navbar, .page-toc, .sidebar, footer, .giscus, .page-footer, .page-actions'):
        tag.decompose()
    # Keep only the main content area
    main_content = soup.select_one('main article') or soup.select_one('article') or soup
    # Rewrite resource paths to absolute URLs
    for tag in main_content.find_all(['img', 'a']):
        for attr in ['href', 'src']:
            if tag.has_attr(attr):
                tag[attr] = urljoin(url, tag[attr])
    # Build a readable title from the last valid path segment
    title_parts = urlparse(url).path.strip('/').split('/')
    title = title_parts[-1].replace('-', ' ').title() if title_parts and title_parts[-1] else 'Document'
    return f'''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{title}</title>
<style>
body {{
    font-family: "Noto Sans CJK SC", Arial, sans-serif;
    line-height: 1.6;
    margin: 2em;
}}
/* keep the original styles */
</style>
</head>
<body>
<h1>{title}</h1>
{main_content}
</body>
</html>'''


def save_as_pdf(html, url):
    # Mirror the URL path under output/: the last segment becomes the file name,
    # the preceding segments become sub-directories
    parsed = urlparse(url)
    path_segments = [seg for seg in parsed.path.strip('/').split('/') if seg]
    if len(path_segments) >= 1:
        dir_path = os.path.join('output', *path_segments[:-1])
        filename = f'{path_segments[-1]}.pdf'
    else:
        dir_path = 'output'
        filename = 'index.pdf'
    os.makedirs(dir_path, exist_ok=True)
    full_path = os.path.join(dir_path, filename)
    try:
        pdfkit.from_string(html, full_path, configuration=config, options={
            'encoding': 'UTF-8',
            'enable-local-file-access': None,
            'quiet': ''  # suppress wkhtmltopdf console output
        })
        print(f'[√] Saved: {full_path}')
    except Exception as e:
        print(f'[!] PDF generation failed: {full_path} - {str(e)}')


def crawl():
    # Breadth-first crawl: fetch a page, queue its new in-domain links, then convert it
    while queue:
        current_url = queue.pop(0)
        html = fetch_page(current_url)
        if not html:
            continue
        new_links = extract_links(html, current_url)
        queue.extend(new_links)
        cleaned_html = clean_html(html, current_url)
        save_as_pdf(cleaned_html, current_url)


if __name__ == '__main__':
    print('Starting crawler, target site:', BASE_URL)
    visited.add(BASE_URL)
    crawl()
    print('✅ All pages have been saved to the output/ directory')