书店网站建设,网站被k 如何恢复,什么是搜索引擎推广,企业展厅设计公司信息主要是分析网站图片的html源代码，来决定正则表达式如何写。
完整代码
#使用正则表达式爬取多张图片,亮点在于数据解析
#爬取网站：https://www.bilibili.com/read/cv11323037?from=search
import requests
import re
import os
image_pathimage
if not …主要是分析网站图片的html源代码来决定正则表达式如何写。
完整代码
#使用正则表达式爬取多张图片,亮点在于数据解析
#爬取网站：https://www.bilibili.com/read/cv11323037?from=search
import requests
import re
import os
# Directory that receives the downloaded images.
image_path = "image"

# Article page whose embedded images are scraped.
url = "https://www.bilibili.com/read/cv11323037?from=search"

# Browser-like User-Agent sent with every request.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 "
        "Edg/91.0.864.48"
    ),
}

# Regular expression used for data parsing. A sample tag from the page:
#   <img data-src="//i0.hdslb.com/bfs/article/45cb84438212c280a5cc22dc6243d4d662a2a535.jpg"
#        width="992" height="700" data-size="226676" class="normal-img"
#        data-index="0" data-type="preview" style="width: 628px; height: 444px;"
#        src="//i0.hdslb.com/bfs/article/45cb84438212c280a5cc22dc6243d4d662a2a535.jpg@785w_555h_progressive.webp">
# The lazy group captures the protocol-relative URL held in ``data-src``.
pattern = r'<img data-src="(.*?)" width'

# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the script forever.
REQUEST_TIMEOUT = 10


def extract_image_sources(html: str) -> list:
    """Return every ``data-src`` value matched by ``pattern`` in *html*.

    ``re.S`` lets ``.`` cross line breaks, so a tag whose attribute value is
    wrapped over several lines is still matched.
    """
    return re.findall(pattern, html, re.S)


def build_image_url(src: str) -> str:
    """Turn a protocol-relative source (``//host/path``) into an https URL."""
    return "https:" + src


def image_file_name(src: str) -> str:
    """Return the last path component of *src*, used as the local file name."""
    return src.split("/")[-1]


def main() -> None:
    """Download the article page, save it, and fetch every image it references."""
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(image_path, exist_ok=True)

    page_html = requests.get(
        url=url, headers=header, timeout=REQUEST_TIMEOUT
    ).text
    # Keep a local copy of the page so the markup can be inspected by hand.
    with open("1.html", "w", encoding="utf-8") as f:
        f.write(page_html)

    image_sources = extract_image_sources(page_html)
    print(image_sources)

    for src in image_sources:
        image_bytes = requests.get(
            url=build_image_url(src), headers=header, timeout=REQUEST_TIMEOUT
        ).content
        image_name = image_file_name(src)
        with open(os.path.join(image_path, image_name), "wb") as f:
            f.write(image_bytes)
        print("{}已爬取完毕".format(image_name))


if __name__ == "__main__":
    main()