做网站是做广告吗,新乡网站建设求职简历,网站建设 邯郸网站制作,网站建设公司大全摘要
本报告利用Java和Selenium爬虫技术获取数据，并使用ECharts库对薪资数据进行可视化分析，旨在探究不同经验和学历的薪资分布情况。
数据来源
数据来源于Boss直聘，使用Java结合Selenium库进行数据抓取。
数据总数：约2000家企…
摘要
本报告利用Java和Selenium爬虫技术获取数据，并使用ECharts库对薪资数据进行可视化分析，旨在探究不同经验和学历的薪资分布情况。
数据来源
数据来源于Boss直聘，使用Java结合Selenium库进行数据抓取。
数据总数：约2000家企业数据；数据类型：Java岗位、全栈、前端；数据地区：深圳、广州
数据清洗
比如15-30K·13薪清洗为3个字段分别存储
-- Split the raw salaryDesc text (e.g. '15-30K·13薪') into three columns:
--   salaryLowest  : the text before the first '-'            -> '15'
--   salaryHighest : the text between the last '-' and 'K'    -> '30'
--   salaryMonth   : the text after '·' with '薪' removed      -> '13',
--                   or NULL when salaryDesc contains no '·'.
-- There is intentionally no WHERE clause: every row of boss_index is rewritten.
-- NOTE(review): SUBSTRING_INDEX is case-sensitive, so a lowercase 'k' or a
-- non-'K' format (e.g. a daily rate in 元/天) leaves the unit text in
-- salaryHighest — confirm against the actual salaryDesc values.
UPDATE boss_index
SET
    salaryLowest  = SUBSTRING_INDEX(salaryDesc, '-', 1),
    salaryHighest = SUBSTRING_INDEX(SUBSTRING_INDEX(salaryDesc, '-', -1), 'K', 1),
    salaryMonth   = CASE
                        WHEN salaryDesc LIKE '%·%'
                            THEN REPLACE(SUBSTRING_INDEX(salaryDesc, '·', -1), '薪', '')
                        ELSE NULL
                    END;
数据分析
不同学历、不同经验、不同地区的薪资分布，使用中位数和众数进行可视化展示（中位数、众数）。
结果展示
tips：y轴数值大于100的数据，其结果单位为 xx元/天
核心代码
爬虫
ChromeOptions ops new ChromeOptions();
ops.addArguments(--remote-allow-origins*);
System.setProperty(webdriver.chrome.driver, driver/chromedriver.exe); //chromedriver.exe存放的路径
System.setProperty(webdriver.chrome.whitelistedIps, );
ChromeDriver driver new ChromeDriver(ops);
driver.get(https://www.zhipin.com/web/geek/job?query%E5%85%A8%E6%A0%88%E5%B7%A5%E7%A8%8B%E5%B8%88city101280100);
driver.manage().window().maximize();
数据分析 SQL
众数
<!--
  getModeSalaryHighest: for each tag, returns the salaryHighest value(s) that
  occur most often (the mode). Ties produce one row per tied value.
  Optional parameters jobName, areaDistrict and educationLabel narrow the rows
  with LIKE '%value%'. They are applied to BOTH derived tables so that the
  per-value counts and the per-tag maximum are computed over the same rows.
-->
<select id="getModeSalaryHighest" resultType="com.example.springboot.dto.MedianSalaryResultDTO">
    SELECT
        salary_highest_counts.tag           AS group_tag_inner,
        salary_highest_counts.salaryHighest AS mode_salaryHighest
    FROM (
        <!-- occurrences of every (tag, salaryHighest) pair -->
        SELECT
            tag,
            salaryHighest,
            COUNT(*) AS salary_count
        FROM boss_index
        WHERE salaryHighest IS NOT NULL
        <if test="jobName != null">
            AND jobName LIKE CONCAT('%', #{jobName}, '%')
        </if>
        <if test="areaDistrict != null">
            AND areaDistrict LIKE CONCAT('%', #{areaDistrict}, '%')
        </if>
        <if test="educationLabel != null">
            AND education_label LIKE CONCAT('%', #{educationLabel}, '%')
        </if>
        GROUP BY tag, salaryHighest
    ) AS salary_highest_counts
    INNER JOIN (
        <!-- highest occurrence count within each tag -->
        SELECT
            tag               AS tag_max_count,
            MAX(salary_count) AS max_count
        FROM (
            SELECT
                tag,
                salaryHighest,
                COUNT(*) AS salary_count
            FROM boss_index
            WHERE salaryHighest IS NOT NULL
            <if test="jobName != null">
                AND jobName LIKE CONCAT('%', #{jobName}, '%')
            </if>
            <if test="areaDistrict != null">
                AND areaDistrict LIKE CONCAT('%', #{areaDistrict}, '%')
            </if>
            <if test="educationLabel != null">
                AND education_label LIKE CONCAT('%', #{educationLabel}, '%')
            </if>
            GROUP BY tag, salaryHighest
        ) AS subquery
        GROUP BY tag
    ) AS max_count_highest
        ON  salary_highest_counts.tag          = max_count_highest.tag_max_count
        AND salary_highest_counts.salary_count = max_count_highest.max_count
    GROUP BY
        salary_highest_counts.tag,
        salary_highest_counts.salaryHighest
</select>
中位数
<!--
  getMedianSalarieshigh: for each tag, returns a salaryHighest value taken from
  the middle of that tag's rows ordered by salaryHighest (the median).
  Rows are numbered per tag with MySQL user variables, then joined to the
  per-tag row count to keep only the middle row number(s).
  Optional parameters jobName, areaDistrict and educationLabel are applied to
  BOTH the ranked rows and the row count so that the row numbers and the total
  refer to the same set of rows.
  NOTE(review): with an even row count two middle rows match, and the final
  GROUP BY keeps one of their salaryHighest values without aggregating them;
  this relies on ONLY_FULL_GROUP_BY being disabled. Confirm whether an average
  of the two is wanted before changing the result type.
  NOTE(review): MySQL does not guarantee the evaluation order of user-variable
  assignments together with ORDER BY; on MySQL 8+ a ROW_NUMBER() window
  function is the reliable replacement.
-->
<select id="getMedianSalarieshigh" resultMap="MedianSalaryResultMap">
    SELECT
        tag           AS group_tag_inner,
        salaryHighest AS median_salaryHighest
    FROM (
        SELECT
            tag,
            salaryHighest,
            <!-- row number restarts at 1 whenever the tag changes -->
            @rowindex := IF(@group_tag = tag, @rowindex + 1, 1) AS rowindex,
            <!-- remember the tag of the current row for the next comparison -->
            @group_tag := tag AS group_tag
        FROM boss_index
        <!-- initialise the user variables -->
        CROSS JOIN (SELECT @rowindex := 0, @group_tag := '') AS var_init
        WHERE salaryHighest IS NOT NULL
        <if test="jobName != null">
            AND jobName LIKE CONCAT('%', #{jobName}, '%')
        </if>
        <if test="areaDistrict != null">
            AND areaDistrict LIKE CONCAT('%', #{areaDistrict}, '%')
        </if>
        <if test="educationLabel != null">
            AND education_label LIKE CONCAT('%', #{educationLabel}, '%')
        </if>
        ORDER BY tag, salaryHighest
    ) AS ranked_salaries
    INNER JOIN (
        <!-- number of ranked rows per tag -->
        SELECT
            tag      AS tag_total_rows,
            COUNT(*) AS total_rows
        FROM boss_index
        WHERE salaryHighest IS NOT NULL
        <if test="jobName != null">
            AND jobName LIKE CONCAT('%', #{jobName}, '%')
        </if>
        <if test="areaDistrict != null">
            AND areaDistrict LIKE CONCAT('%', #{areaDistrict}, '%')
        </if>
        <if test="educationLabel != null">
            AND education_label LIKE CONCAT('%', #{educationLabel}, '%')
        </if>
        GROUP BY tag
    ) AS total_rows
        ON ranked_salaries.tag = total_rows.tag_total_rows
    <!-- the middle row number(s): one value for odd totals, two for even totals -->
    WHERE rowindex IN (FLOOR((total_rows + 1) / 2), FLOOR((total_rows + 2) / 2))
    GROUP BY group_tag_inner
</select>