Downloading an entire site and saving it as MHTML: the code

The MHTML format has a distinct advantage: it preserves all of the original page's layout elements and embedded images in a single file, so the page renders without any external dependencies, which makes it both readable and convenient. The following shows how to automate this: starting from a site's home page, crawl the whole site recursively, save every fetched page in MHTML format, and store the pages in a directory hierarchy that mirrors the URL structure.
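The key primitive is Chrome's DevTools Protocol command Page.captureSnapshot, which returns the current page serialized as an MHTML string and which Selenium exposes on Chrome through execute_cdp_cmd. A minimal sketch of just the capture step (the URL and output file name here are placeholders):

# Minimal sketch: save a single page as MHTML via the DevTools Protocol.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://example.com/')  # placeholder URL
# Page.captureSnapshot returns the serialized page under the 'data' key.
snapshot = driver.execute_cdp_cmd('Page.captureSnapshot', {})
with open('page.mhtml', 'w', newline='') as f:
    f.write(snapshot['data'])
driver.quit()

The full script below adds the recursive crawl and the URL-to-directory mapping. Code: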
# coding: utf-8
import requests
import re
from bs4 import BeautifulSoup
import codecs
import json
import time
import datetime
import os
import sys
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def replace_url(name):
    # Strip characters that are illegal or awkward in file names.
    # (The exact character set was garbled in the source; this is a
    # reasonable reconstruction.)
    for i in [':', '-', '/', '\\', '|', '*', '?', ',', '·', '“', '”', '"', "'"]:
        name = name.replace(i, '')
    return name


def replace_dir(name):
    # Same idea for directory names; '..' is removed so a crafted URL
    # cannot escape the output tree.
    for i in ['|', ':', '*', '·', ',', '“', '”', '"', ' ', '..']:
        name = name.replace(i, '')
    return name


def get_dirname(path):
    # Return the directory part of a path, or the base name if there is none.
    if os.path.dirname(path) == '':
        return os.path.basename(path)
    return os.path.dirname(path)


def get_href_recursive(loginreqsession, Todo, Finished, Files, black_list):
    """Recursively collect all links, using sets for de-duplication."""
    if len(Todo) == 0:
        return
    for url in list(Todo):
        if url in Finished or url in black_list:
            Todo.remove(url)
            continue
        page_html_text = loginreqsession.get(url).content.decode('utf-8')
        soup = BeautifulSoup(page_html_text, 'html.parser')
        hrefs = soup.find_all('a')
        for s in hrefs:
            href = s.get('href')
            if not href:
                continue
            # Skip absolute links, in-page anchors, paths starting with '.',
            # javascript: pseudo-links and download links.
            excludes = ['http', '#', '.', 'javascript', u'下载']
            skip = False
            for prefix in excludes:
                if href.startswith(prefix):
                    skip = True
                    break
            if skip:
                continue
            print(href)
            Todo.add('{}{}'.format(url, href))
        Finished.add(url)
        Todo.remove(url)
    get_href_recursive(loginreqsession, Todo, Finished, Files, black_list)


def create_href_list(home_page):
    loginreqsession = requests.session()
    Finished = set()
    Todo = set()
    Files = []
    black_list = set()
    Todo.add(home_page)
    get_href_recursive(loginreqsession, Todo, Finished, Files, black_list)
    with open('list.txt', 'w') as f:
        for link in Finished:
            f.write('{}\n'.format(link))


def save_page_as_mhtml(home_page, driver, wait, url):
    # Map the URL onto a local path: the site root becomes 'home', and any
    # trailing slash is dropped before the .mhtml extension is appended.
    pagename = url.replace(home_page, 'home').strip()
    if pagename.endswith('/'):
        pagename = pagename[:-1]
    output_path = '{}.mhtml'.format(pagename)
    output_path = os.path.join(
        replace_dir(os.path.dirname(output_path)).strip(),
        replace_url(os.path.basename(output_path)).strip())
    base_dir = get_dirname(output_path)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    if os.path.exists(output_path):
        return True
    # Wait for the page to load, with generous timeouts.
    driver.set_page_load_timeout(120)
    driver.set_script_timeout(120)
    try:
        driver.get(url)
    except Exception:
        print('timeout:', url)
        return True
    time.sleep(10)  # optional extra settling time
    # Ask Chrome, via the DevTools Protocol, for an MHTML snapshot.
    res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
    try:
        with open(output_path, 'w', newline='') as f:
            f.write(res['data'])
    except Exception:
        return True
    return True


def save_pages(home_page):
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    wait = WebDriverWait(driver, 10)
    with open('list.txt', 'r') as f:
        for link in tqdm(f.readlines()):
            save_page_as_mhtml(home_page, driver, wait, link)


def main():
    home_page = 'http://192.168.1.100'
    create_href_list(home_page)
    save_pages(home_page)


if __name__ == '__main__':
    main()
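The script runs in two phases. First, create_href_list crawls the site with requests and BeautifulSoup, following only relative links, and writes every discovered URL to list.txt. Then save_pages replays each URL in a real Chrome instance and snapshots it. Because the site root is rewritten to 'home', a URL such as http://192.168.1.100/docs/guide/ (a hypothetical example) ends up as home/docs/guide.mhtml, so the on-disk tree mirrors the URL hierarchy. Note that anything starting with http, #, ., or javascript is skipped, so external links and in-page anchors never enter the queue.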
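One caveat: get_href_recursive calls itself once per crawl round, and a very deep site can exhaust Python's default recursion limit (about 1000 frames). A worklist loop avoids this entirely. The sketch below is a drop-in alternative under the same assumptions (same de-duplication sets, same exclusion prefixes), not part of the original script:

# Iterative variant of the crawl (a sketch, not the original code).
import requests
from bs4 import BeautifulSoup


def get_href_iterative(session, todo, finished, black_list):
    excludes = ('http', '#', '.', 'javascript', u'下载')
    while todo:
        url = todo.pop()
        if url in finished or url in black_list:
            continue
        try:
            html = session.get(url).content.decode('utf-8')
        except Exception:
            finished.add(url)  # record the failure and move on
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for a in soup.find_all('a'):
            href = a.get('href')
            if not href or href.startswith(excludes):
                continue
            todo.add('{}{}'.format(url, href))
        finished.add(url)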