

Today I'm sharing some code; you're welcome to use it as a reference for learning. Building on the previous post, I added multithreading, which made the scraper more than ten times faster. The code is as follows:

```python
import csv
import random
import re
import threading
import concurrent.futures
from datetime import datetime
from random import choice
from urllib.parse import urljoin

import chardet
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

# Browser options. The Selenium driver is carried over from the previous post;
# the multithreaded version below actually fetches pages with requests.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # headless mode speeds up crawling
chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images
# Path to the Chrome WebDriver
driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver',
                          options=chrome_options)


def get_time():
    """Return a random delay (3-6 s) for throttling requests."""
    return round(random.uniform(3, 6), 1)


def get_user_agent():
    """Return a random user agent to mimic different browsers."""
    user_agents = [
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Mozilla/5.0 (compatible; Baiduspider/2.0; http://www.baidu.com/search/spider.html)',
    ]
    return choice(user_agents)


def get_page(list_url):
    """Return the largest page number shown on a stock bar's list page."""
    headers = {'User-Agent': get_user_agent()}
    response = requests.get(list_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        bump_elements = soup.find_all(class_='nump')
        # Extract the page numbers and keep the largest one
        nump_numbers = [int(element.text) for element in bump_elements]
        return max(nump_numbers) if nump_numbers else None
    print(f'Error: {response.status_code}')
    return None


def generate_urls(base_url, page_number, total_pages):
    """Build the paginated list URLs, starting from page 2."""
    urls = []
    for page in range(2, total_pages + 1, page_number):
        urls.append(f'{base_url},f_{page}.html')
    return urls


def get_detail_urls_by_keyword(urls):
    """Collect comment counts, author links, read counts and post dates."""
    comment, link, reads, date = [], [], [], []
    progress_bar = tqdm(total=len(urls), desc='Processing URLs', position=0, leave=True)
    lock = threading.Lock()  # guards the shared result lists

    def process_url(url):
        headers = {'User-Agent': get_user_agent()}
        response = requests.get(url, headers=headers)
        encoding = chardet.detect(response.content)['encoding']
        if response.status_code == 200:
            html_content = response.content.decode(encoding)
            soup = BeautifulSoup(html_content, 'html.parser')
            with lock:
                # Convert relative author URLs to absolute ones
                links = []
                for element in soup.select('div.author a'):
                    href = element.get('href')
                    if href:
                        links.append(urljoin('https:', href))
                links = ['https://i.eastmoney.com/' + text.split('//i.eastmoney.com/')[-1].split()[0]
                         for text in links]
                # Strip the common profile prefix and accumulate across pages
                link.extend(
                    l[len('https://i.eastmoney.com/'):] if l.startswith('https://i.eastmoney.com/') else l
                    for l in links)
                # Extract comment counts
                for element in soup.select('div.reply'):
                    comment.append(element.text.strip().split(':')[-1])
                # Extract dates such as "12-01 09:30"
                pattern = re.compile(r'\d{1,2}-\d{1,2} \d{2}:\d{2}')
                date.extend(pattern.findall(html_content))
                # Extract read counts
                for element in soup.select('div.read'):
                    reads.append(element.text.strip().split(':')[-1])
            progress_bar.update(1)
        else:
            print(f'Error: {response.status_code}')

    # One thread per URL
    threads = []
    for url in urls:
        thread = threading.Thread(target=process_url, args=(url,))
        thread.start()
        threads.append(thread)
    # Wait for all threads to complete
    for thread in threads:
        thread.join()
    progress_bar.close()
    return comment, link, reads, date


def extract_and_combine(url):
    """Pull the six-digit stock code out of a list URL."""
    match = re.search(r'\d{6}', url)
    return match.group() if match else None


def process_dates(date_list):
    """Attach years to "MM-DD HH:MM" strings, rolling back at year boundaries."""
    processed_dates = []
    current_year = 2023
    for date_str in date_list:
        try:
            date_obj = datetime.strptime(date_str, '%m-%d %H:%M')
            # The list runs newest-first, so a jump to a larger month means
            # we crossed into the previous year
            if processed_dates and date_obj.month > processed_dates[-1].month:
                current_year -= 1
            processed_dates.append(date_obj.replace(year=current_year))
        except ValueError as e:
            print(f'Error processing date {date_str}: {e}')
    return processed_dates


def write_to_csv_file(comment, link, reads, date, result):
    """Write comment counts, links, read counts and dates to a CSV file."""
    csv_file_path = result + '_评论.csv'
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['评论数', '链接', '阅读数', '日期'])
        csv_writer.writerows(zip(comment, link, reads, date))
    print(f'CSV 文件已生成: {csv_file_path}')


def filter_and_append_links(comment, link):
    """Keep the links whose comment count is at least zero."""
    final_link = []
    for i in range(4, len(link)):
        if int(comment[i]) >= 0:
            final_link.append(link[i])
    return final_link


def remove_duplicates(input_list):
    """Drop duplicates while preserving order."""
    unique_list = []
    for item in input_list:
        if item not in unique_list:
            unique_list.append(item)
    return unique_list


def process_result_links(links):
    """Deduplicate the links and drop list-page entries."""
    result_link = remove_duplicates(links)
    for item in result_link[:]:  # iterate over a copy while removing
        if 'list' in item:
            result_link.remove(item)
    return result_link


def get_information_for_url(url):
    """Scrape bar age, IP location and fan count from a user profile page."""
    influence, age, location, fan = [], [], [], []
    headers = {'User-Agent': get_user_agent()}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract bar ages
        for element in soup.select('div.others_level p:contains("吧龄") span'):
            age.append(element.text.strip())
        # Extract IP locations
        for element in soup.select('p.ip_info'):
            match = re.search(r'([^?]+)\?', element.text.strip())
            if match:
                location.append(match.group(1))
        # Extract fan counts
        for element in soup.select('div.others_fans a#tafansa span.num'):
            fan.append(element.text.strip().split(':')[-1])
    else:
        print(f'Error: {response.status_code}')
    # Always return four lists so the caller can unpack the result
    return influence, age, location, fan


def get_information(urls):
    """Fetch profile information for all links with a thread pool."""
    influence, age, location, fan = [], [], [], []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(get_information_for_url, urls),
                            total=len(urls), desc='Processing URLs'))
    for result in results:
        influence.extend(result[0])
        age.extend(result[1])
        location.extend(result[2])
        fan.extend(result[3])
    return age, location, fan


def write_to_csv(result_link, age, location, fan, result):
    """Write the per-user information to a CSV file."""
    csv_filename = result + '_用户.csv'
    data = [{'链接': link, '吧龄': a, '属地': loc, '粉丝': f}
            for link, a, loc, f in zip(result_link, age, location, fan)]
    with open(csv_filename, 'w', newline='') as csvfile:
        fieldnames = ['链接', '吧龄', '属地', '粉丝']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
    print(f'Data has been written to {csv_filename}')


def convert_to_guba_link(file_path):
    """Read six-digit stock codes from an Excel file and build guba list URLs."""
    guba_links = []
    try:
        df = pd.read_excel(file_path)
        six_digit_numbers = df.iloc[:, 0]  # codes are in the first column
        for number in six_digit_numbers:
            guba_links.append(f'https://guba.eastmoney.com/list,{number:06d}.html')
    except Exception as e:
        print(f'Error: {e}')
    return guba_links


def main():
    """Main entry point."""
    list_urls = convert_to_guba_link('number.xlsx')
    print('爬虫程序开始执行---')
    i = 2
    for list_url in list_urls:
        page = 3
        print('总页数', page)
        page_number = 1
        url_without_html = list_url.replace('.html', '')
        urls = generate_urls(url_without_html, page_number, page)
        print(urls)
        comment, link, reads, date = get_detail_urls_by_keyword(urls)
        print(comment)
        print(link)
        print(reads)
        print(date)
        date = process_dates(date)
        result = extract_and_combine(list_url)
        write_to_csv_file(comment, link, reads, date, result)
        link = process_result_links(link)
        age, location, fan = get_information(link)
        print(age)
        print(location)
        print(fan)
        write_to_csv(link, age, location, fan, result)
        print('抓取完个数', i)
        i = i + 1


if __name__ == '__main__':
    main()
```
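One caveat worth flagging: `get_detail_urls_by_keyword` starts one thread per URL, so a long URL list means hundreds of simultaneous requests to the same site. Below is a minimal sketch of the same fetch step using a bounded pool instead; the names `fetch_page` and `fetch_all`, the worker count of 8, and the sample stock code are my own illustration, not part of the original script:

```python
import concurrent.futures
import random
import time

import requests


def fetch_page(url):
    """Download one page, sleeping briefly first to stay polite."""
    time.sleep(round(random.uniform(3, 6), 1))  # same random-delay idea as get_time()
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    response.raise_for_status()
    return response.text


def fetch_all(urls, max_workers=8):
    """Fetch pages concurrently, never more than max_workers at a time."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map returns results in input order, unlike
        # one-thread-per-URL appends into shared lists
        return list(executor.map(fetch_page, urls))


if __name__ == '__main__':
    # Hypothetical example: pages 2-4 of one stock bar's list view
    pages = fetch_all([f'https://guba.eastmoney.com/list,600519,f_{p}.html'
                       for p in range(2, 5)])
    print(f'downloaded {len(pages)} pages')
```

Capping the pool keeps most of the multithreading speedup while bounding memory use and request rate, and because results come back in input order, the later `zip(comment, link, reads, date)` step stays aligned even without a lock.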