关于网站建设与维护论文,有没有做相册的网站,wordpress 微信会员系统,WordPress图床工具背景#xff1a;这是尚硅谷SparkSQL练习题#xff0c;本文用HiveSQL进行了实现。 数据集#xff1a;用户点击表#xff0c;商品表#xff0c;城市表 题目: ① 求每个地区点击量前三的商品#xff1b; ② 在①的基础上#xff0c;求出每个地区点击量前三的商品后这是尚硅谷SparkSQL练习题本文用HiveSQL进行了实现。 数据集用户点击表商品表城市表 题目: ① 求每个地区点击量前三的商品 ② 在①的基础上求出每个地区点击量前三的商品后求出每个商品中的点击量前三的城市分别占本商品总点击量的百分比。 建表导入数据 没啥说的建表语句直接抄过来
use atguigu;CREATE TABLE user_visit_action(date string,user_id bigint,session_id string,page_id bigint,action_time string,search_keyword string,click_category_id bigint,click_product_id bigint,order_category_ids string,order_product_ids string,pay_category_ids string,pay_product_ids string,city_id bigint)
row format delimited fields terminated by \t;load data local inpath datas/user_visit_action.txt
into table atguigu.user_visit_action;CREATE TABLE product_info(product_id bigint,product_name string,extend_info string)
row format delimited fields terminated by \t;load data local inpath datas/product_info.txt into table atguigu.product_infoCREATE TABLE city_info(city_id bigint,city_name string,area string)
row format delimited fields terminated by \t;load data local inpath datas/city_info.txt into table atguigu.city_info;查询
第一问求每个地区点击量前三的商品 分析按 areaproduct_name 两个字段分组求出点击量click_ct2保留每个地区点击量前三的商品。
selectarea,product_name,click_ct2
from (selectarea,product_name,click_ct2,row_number() over( partition by area order by click_ct2 desc ) as rn2from (selectarea,product_name,count(*) as click_ct2from (selecta.*,p.product_name,c.area,c.city_namefrom user_visit_action ajoin product_info p on a.click_product_id p.product_idjoin city_info c on a.city_id c.city_idwhere a.click_product_id -1) t1 group by area, product_name) t2
) t3
where rn2 3第二问 在①的基础上求出每个地区点击量前三的商品后求每个商品中的点击量前三的城市分别占本商品总点击量的百分比。
分析 第一问求出了每个地区、每个商品的点击量这个点击量叫click_ct2并取了每个地区前三名的商品。 在第二问中可以先求出每个地区、每个商品、每个城市的点击量这个点击量叫click_ct3取每个地区、每个商品点击量的前三名城市。
用click_ct3/click_ct2就是每个地区、每个商品、每个城市点击率这个点击率叫click_rate3。 click_ct3所在的临时表叫tmp1click_ct2所在的临时表叫tmp2。 tmp1有三个维度粒度更细数据条数会更多tmp2有两个维度粒度粗数据条数少。 无论是tmp1 join tmp2还是tmp2 join tmp1两种方式都可以tmp1中不符合条件的数据会被筛掉。
之后用concat()将每行的城市名和点击率拼接在一起 再按地区、商品、总点击量进行分组用collect_set()收集每组拼接的结果 将收集的结果拼接成字符串再转换成map。
--维度area,city_name,product_name
--度量点击次数
--限定前三
with tmp1 as(select area,product_name,city_name,click_ct3from(select area,city_name,product_name,click_ct3,row_number()over(partition by area,product_name order by click_ct3) rn1from(select area,city_name,product_name,count(*) click_ct3from(selecta.*,p.product_name,c.area,c.city_namefrom user_visit_action ajoin product_info p on a.click_product_id p.product_idjoin city_info c on a.city_id c.city_idwhere a.click_product_id -1)t1group by area,city_name,product_name)t2)t3where rn13order by area,product_name,city_name,click_ct3 desc
),
--维度area,product_name
--度量点击次数
--限定前三
tmp2 as(selectarea,product_name,click_ct2from (selectarea,product_name,click_ct2,row_number() over( partition by area order by click_ct2 desc ) as rn2from (selectarea,product_name,count(*) as click_ct2from (selecta.*,p.product_name,c.area,c.city_namefrom user_visit_action ajoin product_info p on a.click_product_id p.product_idjoin city_info c on a.city_id c.city_idwhere a.click_product_id -1) t1 group by area, product_name) t2) t3 where rn2 3
)
select area,product_name,click_ct2,-- 按地区、商品、总点击量进行分组用collect_set()收集每组拼接的结果-- 将收集后的结果转换成mapstr_to_map(concat_ws(,,collect_set(city_rate)),,,:) city_rate3
from(select area,product_name,click_ct2,click_rate,-- 将每行的城市名和点击率拼接在一起concat(city_name,:,click_rate,%) city_ratefrom(select tmp1.area,tmp1.product_name,tmp1.city_name,tmp2.click_ct2,round(tmp1.click_ct3*100/tmp2.click_ct2,2) click_ratefrom tmp2 join tmp1 on tmp2.areatmp1.area and tmp2.product_nametmp1.product_name)t1order by area,click_ct2 desc,click_rate desc
)t2
group by area,product_name,click_ct2
order by area,click_ct2 desc