当前位置：首页 > news >正文

垦利网站定制全网通官方网站

news 2025/11/15 4:47:54

垦利网站定制,全网通官方网站,学做面包到什么网站,中国建筑2022校园招聘Python 爬虫小练获取贝壳网数据使用到的模块标准库 Python3 标准库列表 os 模块：os 模块提供了许多与操作系统交互的函数，例如创建、移动和删除文件和目录，以及访问环境变量等。math 模块：math 模块提供了数学函数，…

Python 爬虫小练获取贝壳网数据

使用到的模块

标准库
Python3 标准库列表
os 模块：os 模块提供了许多与操作系统交互的函数，例如创建、移动和删除文件和目录，以及访问环境变量等。
math 模块：math 模块提供了数学函数，例如三角函数、对数函数、指数函数、常数等
datetime 模块：datetime 模块提供了更高级的日期和时间处理函数，例如处理时区、计算时间差、计算日期差等
logging 模块：使用标准库提供的 logging API 最主要的好处是，所有的 Python 模块都可能参与日志输出，包括你自己的日志消息和第三方模块的日志消息。
logging.config 模块：可配置 logging 模块。它们的使用是可选的 — 要配置 logging 模块，你可以使用这些函数，也可以通过调用主 API (在 logging 本身定义) 并定义在 logging 或 logging.handlers 中声明的处理器。
logging.handlers 模块：这个包提供了以下有用的处理程序。请注意有三个处理程序类 (StreamHandler, FileHandler 和 NullHandler) 实际上是在 logging 模块本身定义的，但其文档与其他处理程序一同记录在此。
urllib 模块：urllib 模块提供了访问网页和处理 URL 的功能，包括下载文件、发送 POST 请求、处理 cookies 等
threading 模块：线程模块提供对线程的支持
SQLite 3 模块：SQLite 是一个C语言库，它可以提供一种轻量级的基于磁盘的数据库，这种数据库不需要独立的服务器进程，也允许需要使用一种非标准的 SQL 查询语言来访问它。一些应用程序可以使用 SQLite 作为内部数据存储。可以用它来创建一个应用程序原型，然后再迁移到更大的数据库。

第三方库
requests 库：Python requests 是一个常用的 HTTP 请求库，可以方便地向网站发送 HTTP 请求，并获取响应结果。requests 模块比urllib模块更简洁。官网地址：Python requests
BeautifulSoup 库：是一个可以从HTML或XML文件中提取数据的Python库。官网地址：BeautifulSoup

使用到的相关逻辑步骤

请求URL

模拟浏览器

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"
}

URL编码

import urllib.parse

baseUrl = "https://nj.ke.com/ershoufang/"
url = baseUrl + "天润城/"
encoded_url = urllib.parse.quote(url, safe="/:?")

无用户认证

response = requests.get(encoded_url, headers=headers)

有用户认证(cookie)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Cookie": "lianjia_token=自己的具体值"
}

response = requests.get(encoded_url, headers=headers)

代理：公司内部若存在代理需要配置。

proxies = {"https": "http://111:8080"}

response = requests.get(encoded_url, headers=headers, proxies=proxies)

解析HTML

soup = BeautifulSoup(response.text, "html.parser")

取属性

soup.select(".title a")[0].attrs.get("href")

取标签值

soup.select(".total span")[0].text.strip()

下载图片资源

# urllib.request配置代理
urllib.request.install_opener(
    urllib.request.build_opener(
        urllib.request.ProxyHandler(proxies)
    )
)

urllib.request.urlretrieve(housingImgUrl, housingTypeImagePath)

分析数据，写入SQLite 3数据库（建表、执行脚本、写入、异常处理）

conn = sqlite3.connect("../db/identifier.sqlite", check_same_thread=False)
c = conn.cursor()

# 执行sql脚本
with open("../db/script/house_listing_price.sql") as sql_file:
    c.executescript(sql_file.read())
conn.commit()

for house_info in house_info_list:
    sql = f"insert into house_listing_price values ( " \
          f"'{house_info['houseid']}' " \
          f",'{house_info['title']}' " \
          f",'{house_info['price']}' " \
          f",'{house_info['address']}' " \
          f",'{house_info['area']}' " \
          f",'{house_info['sealDate']}' " \
          f",'{house_info['housingType']}' " \
          f",'{house_info['houseUrl']}')"
    try:
        c.execute("BEGIN")
        c.execute(sql)
        c.execute("COMMIT")
    except:
        print("[" + str(datetime.datetime.now()) + "] 写入数据库异常sql is [" + sql + "]")
        c.execute("ROLLBACK")
conn.commit()
conn.close()

完整示例

import requests
from bs4 import BeautifulSoup
import math
import datetime
import sqlite3
import urllib.request
import os

# 代理-公司用
proxies = {"https": "http://xzproxy.cnsuning.com:8080"}
# 无代理
# proxies = {}

# 下载图片第三方配置代理
urllib.request.install_opener(
    urllib.request.build_opener(
        urllib.request.ProxyHandler(proxies)
    )
)

# 模拟浏览器请求的header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36"
}

# 挂牌列表URL-不分页
url = "https://nj.ke.com/ershoufang/co22l2rs%E5%A4%A9%E6%B6%A6%E5%9F%8E%E5%8D%81%E5%9B%9B%E8%A1%97%E5%8C%BA/"
response = requests.get(url, headers=headers, proxies=proxies)
soup = BeautifulSoup(response.text, "html.parser")
# 网站每页30条
everypagecount = 30
sumhouse = soup.select(".total span")[0].text.strip()
pagesum = int(sumhouse) / everypagecount
pagesum = math.ceil(pagesum)
# 网站只提供100页
pagesum = min(pagesum, 100)
print("[" + str(datetime.datetime.now()) + "] 总记录数" + str(sumhouse) + ",总页数" + str(pagesum))
# 创建一个空列表用于存储房源信息
house_info_list = []


# 请求房源列表数据
def requestUrl(real_url):
    response = requests.get(real_url, headers=headers, proxies=proxies)
    soup = BeautifulSoup(response.text, "html.parser")
    # 获取房源列表数据
    house_list = soup.select(".sellListContent li .clear")
    # 循环遍历房源列表提取所需信息
    for house in house_list:
        # 挂牌标题
        title = house.select(".title a")[0].text.strip()
        # 挂牌价格
        price = house.select(".totalPrice span")[0].text.strip()
        # 地址小区名称
        address = house.select(".positionInfo a")[0].text.strip()
        # 楼层简述
        area = house.select(".houseInfo")[0].text.strip().replace("\n", "").replace(" ", "").split("|")[0]
        area = area[0:area.index(")") + 1]
        # 房屋登记编号
        houseId = house.select(".unitPrice")[0].attrs.get("data-hid")
        # 房源详情页的URL
        href = house.select(".title a")[0].attrs.get("href")
        response2 = requests.get(href, headers=headers, proxies=proxies)
        soup2 = BeautifulSoup(response2.text, "html.parser")
        # 挂牌日期
        sealDate = soup2.select(".introContent .transaction li")[0].text.strip()[4:]
        # 户型
        housingType = soup2.select(".introContent .base .content li")[0].text.strip()[4:].strip()
        # 房屋图片列表
        house_images_list = soup2.select(".thumbnail .smallpic li")
        housingTypeImagePath = "../src/main/resources/images/housingType/" + houseId + ".jpg"
        for house_images in house_images_list:
            # 下载户型图
            if "户型图" == house_images.attrs.get("data-desc") and not os.path.exists(housingTypeImagePath):
                housingImgUrl = house_images.attrs.get("data-src")
                urllib.request.urlretrieve(housingImgUrl, housingTypeImagePath)
        # 将提取到的信息添加到房源信息列表中
        house_info_list.append({
            "title": title,
            "price": price,
            "address": address,
            "area": area,
            "houseid": houseId,
            "sealDate": sealDate,
            "housingType": housingType,
            "houseUrl": href
        })
    return


pageNo = 0
while pageNo < pagesum:
    currentPageNo = str(pageNo + 1)
    # 挂牌列表URL-分页
    url = "https://nj.ke.com/ershoufang/pg" + currentPageNo + "co22l2rs%E5%A4%A9%E6%B6%A6%E5%9F%8E%E5%8D%81%E5%9B%9B%E8%A1%97%E5%8C%BA/"
    print("[" + str(datetime.datetime.now()) + "] 获取第" + currentPageNo + "页")
    requestUrl(url)
    pageNo = pageNo + 1

# 将房源信息列表保存为CSV文件
import csv

# print("写入文件中")
# current_date = datetime.datetime.now()
# formatted_date = current_date.strftime("%Y-%m-%d")
# filename = "house_info-" + formatted_date + ".csv"
# with open(filename, "w", newline="", encoding="utf-8-sig") as f:
#     writer = csv.writer(f)
#     writer.writerow(["标题", "价格", "地址", "位置", "房屋ID"])
#     for house_info in house_info_list:
#         writer.writerow([
#             house_info["title"], house_info["price"], house_info["address"],
#             house_info["area"], house_info["houseid"]
#         ])
# print("写入完成")

print("[" + str(datetime.datetime.now()) + "] 写入数据库")
conn = sqlite3.connect("../db/identifier.sqlite", check_same_thread=False)
c = conn.cursor()

# 执行sql脚本
with open("../db/script/house_listing_price.sql") as sql_file:
    c.executescript(sql_file.read())
conn.commit()

for house_info in house_info_list:
    sql = f"insert into house_listing_price values ( " \
          f"'{house_info['houseid']}' " \
          f",'{house_info['title']}' " \
          f",'{house_info['price']}' " \
          f",'{house_info['address']}' " \
          f",'{house_info['area']}' " \
          f",'{house_info['sealDate']}' " \
          f",'{house_info['housingType']}' " \
          f",'{house_info['houseUrl']}')"
    try:
        c.execute("BEGIN")
        c.execute(sql)
        c.execute("COMMIT")
    except:
        print("[" + str(datetime.datetime.now()) + "] 写入数据库异常sql is [" + sql + "]")
        c.execute("ROLLBACK")
conn.commit()
conn.close()
print("[" + str(datetime.datetime.now()) + "] 写入完成")

查看全文

http://www.zqtcl.cn/news/584497/