pic cms图片网站管理系统手机版,个人网站实例,帝国 cms 网站关键字,重庆新闻频道天天630直播人立晚风月照中 独散步长廊 月浸在池塘 欢欣充满了心上 静听乐悠扬 越觉乐洋洋 夜鸟高枝齐和唱 月照彩云上 熏风轻掠 如入山荫心向往 🎵 苏妙玲《彩云追月》 import time import requests
from playwright._impl._errors import TimeoutError
f…
人立晚风月照中 独散步长廊 月浸在池塘 欢欣充满了心上 静听乐悠扬 越觉乐洋洋 夜鸟高枝齐和唱 月照彩云上 熏风轻掠 如入山荫心向往 —— 苏妙玲《彩云追月》
import time
import requests
from playwright._impl._errors import TimeoutError
from playwright.sync_api import sync_playwright
from loguru import logger
from scrapy import Selector
import re
from urllib.parse import urlparse

# Default browser binary used by extract_html(); pass ``executable_path`` to
# override it on machines where Chromium is installed elsewhere.
DEFAULT_CHROMIUM_PATH = r'/Applications/Chromium.app/Contents/MacOS/Chromium'

# Upper bound (seconds) for the Weibo AJAX request so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT_SECONDS = 15

# Matches a single character in the basic CJK Unified Ideographs range.
_CJK_CHAR_RE = re.compile('[\u4e00-\u9fa5]')


def extract_html_text(html):
    """Return only the Chinese characters found in the ``<body>`` of *html*.

    Args:
        html: HTML source of a page.

    Returns:
        A string made of every character in U+4E00..U+9FA5 that appears in
        the body text, in document order; ``''`` when there is none (or when
        *html* is empty).
    """
    logger.info('Worker-0-run-extract_html_text')
    if not html:
        return ''
    body_text = ''.join(Selector(text=html).xpath('//body//text()').getall())
    # Whitespace, Latin text and punctuation are all dropped by this filter,
    # so no separate whitespace-stripping pass is needed.
    return ''.join(_CJK_CHAR_RE.findall(body_text))


def extract_html(url, executable_path=DEFAULT_CHROMIUM_PATH):
    """Load *url* in headless Chromium and return the rendered HTML.

    Args:
        url: Page to open.
        executable_path: Path to the Chromium binary to launch.

    Returns:
        The page's HTML as reported by Playwright after a fixed 5 second
        wait.

    Raises:
        playwright TimeoutError: if navigation exceeds the 15 s page timeout.
    """
    with sync_playwright() as p:
        # Engines available on ``p``: chromium, firefox, webkit.
        browser = p.chromium.launch(
            headless=True,  # set to False to watch the browser window
            executable_path=executable_path,
        )
        try:
            context = browser.new_context()
            page = context.new_page()
            page.set_default_timeout(15000)
            page.goto(url)
            # Fixed 5 s pause before reading the DOM.
            # NOTE(review): presumably gives client-side rendering time to
            # finish -- confirm whether a selector wait would be more reliable.
            time.sleep(5)
            return page.content()
        finally:
            # Closing the browser also closes its contexts and pages, even
            # when goto() raised.
            browser.close()


def _parse_footer_counts(label):
    """Split an ``aria-label`` value into its three comma-separated fields.

    Returns a 3-tuple of strings, or ``None`` when *label* is missing or does
    not contain exactly three comma-separated parts.
    """
    if not label:
        return None
    parts = label.split(',')
    if len(parts) != 3:
        return None
    return tuple(parts)


def crawl(url):
    """Render *url* and print the three counters from the footer's aria-label.

    The first ``<footer>`` element's ``aria-label`` attribute is read and
    split on commas into (share, comment, like), which are printed. Fetch
    failures and a missing/malformed label are logged and the function
    returns without printing.
    """
    try:
        html = extract_html(url)
    except TimeoutError:
        logger.info('页面访问超时检查网络连接或者网站是否正常')
        return
    except Exception:
        # Top-level boundary for the browser step: record the traceback
        # instead of silently discarding the cause.
        logger.exception('页面访问异常')
        return

    # ``@aria-label`` selects the attribute; without ``@`` the expression
    # would look for a child element named aria-label.
    label = Selector(text=html).xpath('//footer/@aria-label').extract_first()
    counts = _parse_footer_counts(label)
    if counts is None:
        logger.warning('footer aria-label missing or malformed: {!r}', label)
        return
    share, comment, like = counts
    print(share, comment, like)


def is_valid_http_url(url):
    """Return True when *url* uses the http/https scheme and names a host."""
    parsed_url = urlparse(url)
    return parsed_url.scheme in ('http', 'https') and bool(parsed_url.netloc)


def request_url(wId):
    """Fetch one status from Weibo's AJAX endpoint and print its counters.

    Args:
        wId: Status identifier sent as the ``id`` query parameter.

    Prints ``reposts_count``, ``comments_count`` and ``attitudes_count`` from
    the JSON response.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error
            status.
        KeyError: if the response JSON lacks one of the three counters.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        # NOTE(review): hard-coded session cookie; consider loading it from
        # configuration instead of source.
        'Cookie': 'SUB=_2AkMRHTAuf8NxqwFRmfscyW7na4Rzzw3EieKnQcH1JRMxHRl-yT9kqnBetRB6Op0ewXqJGg99xI9PHf9GLxIl4ywMtbjK;',
    }
    resp = requests.get(
        'https://weibo.com/ajax/statuses/show',
        params={'id': wId, 'locale': 'zh-CN'},
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    resp.raise_for_status()
    data = resp.json()
    reposts_count = data['reposts_count']
    comments_count = data['comments_count']
    attitudes_count = data['attitudes_count']
    print(reposts_count, comments_count, attitudes_count)


def _status_id_from_url(url):
    """Return the last path segment of *url*, ignoring query and fragment."""
    return urlparse(url).path.rstrip('/').rsplit('/', 1)[-1]


if __name__ == '__main__':
    url = 'https://weibo.com/1731986465/Oe4y0u9Pn?refer_flag=1001030103_'
    crawl(url)
    wId = _status_id_from_url(url)
    request_url(wId)