wordpress 文章 移除侧边栏,网站群站优化,永兴集团网站,网站用户建设的设计与实现 之前实现了对大学排名数据的爬取：爬虫2_2019年549所中国大学排名。近期复现代码，发现原网站升级，在翻页时，发现URL不改变，修改代码，使用网页自动化工具selenium实现对该类网站数据获取。
#-*- coding: UTF-8 -…之前实现了对大学排名数据的爬取：爬虫2_2019年549所中国大学排名。近期复现代码，发现原网站升级，在翻页时发现URL不改变，修改代码，使用网页自动化工具selenium实现对该类网站数据获取。
#-*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import bs4
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains # 鼠标操作
from selenium.webdriver.common.by import By
import time


def get_info(soup, _type, element, param=None):
    """Look up *element* under *soup* with ``find`` or ``find_all``.

    Args:
        soup: A BeautifulSoup tag (or document) to search in.
        _type: ``"find"`` to return the cleaned text of the first match, or
            ``"find_all"`` to return the raw list of matching tags.
        element: Tag name to search for, e.g. ``"div"``.
        param: Optional single ``"key=value"`` filter that is forwarded to
            BeautifulSoup as a keyword argument, e.g. ``"class_=ranking"``.

    Returns:
        For ``"find"``: the tag's text with spaces and newlines removed, or
        ``None`` when nothing matches. For ``"find_all"``: the list of
        matching tags (possibly empty).

    Raises:
        ValueError: If ``_type`` is neither ``"find"`` nor ``"find_all"``.
    """
    # Split only on the first "=" so values may themselves contain "=".
    params = dict([param.split("=", 1)]) if param is not None else {}

    if _type == "find":
        res = soup.find(element, **params)
        if res is None:
            return None
        # get_text() instead of .string: .string is None as soon as the tag
        # has more than one child, which would make .replace() blow up.
        return res.get_text().replace(" ", "").replace("\n", "")

    if _type == "find_all":
        return soup.find_all(element, **params)

    raise ValueError("unsupported lookup type: %r" % (_type,))


def fillUnivList(html):
    """Parse one ranking page and print one formatted line per university.

    Args:
        html: Page source containing the ranking table (a ``<tbody>`` whose
            rows hold rank, name/logo, location, category, score and tier
            cells, in that order).
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if tbody is None:
        # Table not rendered (yet) - nothing to print for this page.
        return

    for tr in tbody.children:
        # Skip whitespace/text nodes between rows; only real tags are rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        td_list = tr.find_all("td")
        if len(td_list) < 6:
            # Not a data row (e.g. placeholder or malformed row).
            continue

        # Rank
        top = get_info(td_list[0], "find", "div", "class_=ranking") or ""
        # Logo URL
        img = td_list[1].find("img")
        logo = img.get("src", "") if img is not None else ""
        # Chinese name / English name
        university_list = get_info(td_list[1], "find_all", "a")
        ch_name = ""
        en_name = ""
        if university_list:
            ch_name = (
                university_list[0].get_text()
                .replace("\n", "").replace("\t", "").strip(" ")
            )
        if len(university_list) > 1:
            en_name = university_list[1].get_text().replace("\n", "").strip(" ")
        # School tags
        tags = get_info(td_list[1], "find", "p") or ""
        # Location
        area = td_list[2].text.replace("\n", "").strip(" ")
        # Category
        main = td_list[3].text.replace("\n", "").strip(" ")
        # Overall score
        score = td_list[4].text.replace("\n", "").strip(" ")
        # School tier
        layer = td_list[5].text.replace("\n", "").strip(" ")

        # Pad CJK fields with the full-width space (U+3000) so columns align.
        print("{:3}|{}|{:80}|{}|{}|{}|{:6}|{:5}|{}".format(
            top, ch_name.ljust(14, "\u3000"), en_name,
            tags.ljust(12, "\u3000"), area.ljust(4, "\u3000"),
            main.ljust(4, "\u3000"), score, layer, logo))


def action_run(driver, actions, info, by=By.ID, time_num=1):
    """Wait until the element is displayed, click it, then pause.

    Args:
        driver: Selenium WebDriver used to locate the element.
        actions: ActionChains instance bound to ``driver``.
        info: Locator value (id, CSS selector, ...) matching ``by``.
        by: Locator strategy; defaults to ``By.ID``.
        time_num: Seconds to sleep after the click so the page can update.
    """
    while True:
        config_facesearch = driver.find_element(by=by, value=info)
        if config_facesearch.is_displayed():
            actions.move_to_element(config_facesearch).click().perform()
            time.sleep(time_num)
            break
        # Element exists but is not visible yet: report and poll again.
        print("%s is not find, watting... " % (info))
        time.sleep(1)


def main():
    """Scrape 20 pages of the 2023 ranking, paging via the "next" button."""
    url = "https://www.shanghairanking.cn/rankings/bcur/2023"
    start = time.strftime("%H:%M:%S", time.localtime())
    # NOTE: swap in webdriver.Chrome() to drive Chrome instead of Firefox.
    driver = webdriver.Firefox()
    try:
        driver.maximize_window()
        driver.get(url)
        time.sleep(2)
        # Mouse actions
        actions = ActionChains(driver)
        for _ in range(20):
            html = driver.page_source
            fillUnivList(html)
            # Scroll to the bottom so the pager is in view before clicking.
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            action_run(driver, actions, info='li[title="下一页"]',
                       by=By.CSS_SELECTOR)
        end = time.strftime("%H:%M:%S", time.localtime())
        print("用时%s - %s" % (start, end))
    finally:
        # Always close the browser, even if a page fails midway.
        driver.quit()


if __name__ == "__main__":
    main()