腾讯cdn wordpress插件,网站视觉优化怎么做,四川seo优化,合肥最新通告今天hive sql 行列转换 开窗函数 炸裂函数
准备原始数据集 学生表 student.csv 讲师表 teacher.csv 课程表 course.csv 分数表 score.csv 员工表 emp.csv 雇员表 employee.csv 电影表 movie.txt 学生表 student.csv
001,彭于晏,1995-05-16,男
002,胡歌,1994-03-20,男
003,周杰伦,…hive sql 行列转换 开窗函数 炸裂函数
准备原始数据集 学生表 student.csv 讲师表 teacher.csv 课程表 course.csv 分数表 score.csv 员工表 emp.csv 雇员表 employee.csv 电影表 movie.txt 学生表 student.csv
001,彭于晏,1995-05-16,男
002,胡歌,1994-03-20,男
003,周杰伦,1995-04-30,男
004,刘德华,1998-08-28,男
005,唐国强,1993-09-10,男
006,陈道明,1992-11-12,男
007,陈坤,1999-04-09,男
008,吴京,1994-02-06,男
009,郭德纲,1992-12-05,男
010,于谦,1998-08-23,男
011,潘长江,1995-05-27,男
012,杨紫,1996-12-21,女
013,蒋欣,1997-11-08,女
014,赵丽颖,1990-01-09,女
015,刘亦菲,1993-01-14,女
016,周冬雨,1990-06-18,女
017,范冰冰,1992-07-04,女
018,李冰冰,1993-09-24,女
019,邓紫棋,1994-08-31,女
020,宋丹丹,1991-03-01,女讲师表 teacher.csv
1001,张高数
1002,李体音
1003,王子文
1004,刘丽英课程表 course.csv
01,语文,1003
02,数学,1001
03,英语,1004
04,体育,1002
05,音乐,1002分数表 score.csv
001,01,94
002,01,74
004,01,85
005,01,64
006,01,71
007,01,48
008,01,56
009,01,75
010,01,84
011,01,61
012,01,44
013,01,47
014,01,81
015,01,90
016,01,71
017,01,58
018,01,38
019,01,46
020,01,89
001,02,63
002,02,84
004,02,93
005,02,44
006,02,90
007,02,55
008,02,34
009,02,78
010,02,68
011,02,49
012,02,74
013,02,35
014,02,39
015,02,48
016,02,89
017,02,34
018,02,58
019,02,39
020,02,59
001,03,79
002,03,87
004,03,89
005,03,99
006,03,59
007,03,70
008,03,39
009,03,60
010,03,47
011,03,70
012,03,62
013,03,93
014,03,32
015,03,84
016,03,71
017,03,55
018,03,49
019,03,93
020,03,81
001,04,54
002,04,100
004,04,59
005,04,85
007,04,63
009,04,79
010,04,34
013,04,69
014,04,40
016,04,94
017,04,34
020,04,50
005,05,85
007,05,63
009,05,79
015,05,59
018,05,87员工表 emp.csv
7369,张三,研发,800.00,30
7499,李四,财务,1600.00,20
7521,王五,行政,1250.00,10
7566,赵六,销售,2975.00,40
7654,侯七,研发,1250.00,30
7698,马八,研发,2850.00,30
7782,金九,行政,2450.0,30
7788,银十,行政,3000.00,10
7839,小芳,销售,5000.00,40
7844,小明,销售,1500.00,40
7876,小李,行政,1100.00,10
7900,小元,讲师,950.00,30
7902,小海,行政,3000.00,10
7934,小红明,讲师,1300.00,30
7934,小红,讲师,1300.00,雇员表 employee.csv
张无忌,男,1980/02/12,2022/08/09,销售,3000,12000,阿朱_小昭,张小无:8_张小忌:9
赵敏,女,1982/05/18,2022/09/10,行政,9000,2000,阿三_阿四,赵小敏:8
宋青书,男,1981/03/15,2022/04/09,研发,18000,1000,王五_赵六,宋小青:7_宋小书:5
周芷若,女,1981/03/17,2022/04/10,研发,18000,1000,王五_赵六,宋小青:7_宋小书:5
郭靖,男,1985/03/11,2022/07/19,销售,2000,13000,南帝_北丐,郭芙,5_郭襄:4
黄蓉,女,1982/12/13,2022/06/11,行政,12000,null,东邪_西毒,郭芙,5_郭襄:4
杨过,男,1988/01/30,2022/08/13,前台,5000,null,郭靖_黄蓉,杨小过:2
小龙女,女,1985/02/12,2022/09/24,前台,6000,null,张三_李四,杨小过:2电影表 movie.txt
《疑犯追踪》-悬疑,动作,科幻,剧情
《Lie to me》-悬疑,警匪,动作,心理,剧情
《战狼2》-战争,动作,灾难订单表 order.csv
1,1001,小元,2022-01-01,10
2,1002,小海,2022-01-02,15
3,1001,小元,2022-02-03,23
4,1002,小海,2022-01-04,29
5,1001,小元,2022-01-05,46
6,1001,小元,2022-04-06,42
7,1002,小海,2022-01-07,50
8,1001,小元,2022-01-08,50
9,1003,小辉,2022-04-08,62
10,1003,小辉,2022-04-09,62
11,1004,小猛,2022-05-10,12
12,1003,小辉,2022-04-11,75
13,1004,小猛,2022-06-12,80
14,1003,小辉,2022-04-13,94创建数据库和数据表
create database chap06;
use chap06;-- 学生表 student.csv
create external table student (stu_id string comment 学生ID,stu_name string comment 学生姓名,birthday string comment 出生日期,gender string comment 学生性别
)row format delimited fields terminated by ,lines terminated by \nstored as textfilelocation /quiz03/student;load data local inpath /root/data/data02/student.csv overwrite into table student;select * from student;-- 讲师表 teacher.csv
create external table teacher (tea_id string comment 课程ID,tea_name string comment 课程名称
)row format delimited fields terminated by ,lines terminated by \nstored as textfilelocation /quiz03/teacher;load data local inpath /root/data/data02/teacher.csv overwrite into table teacher;select * from teacher;-- 课程表 course.csv
create external table course (course_id string comment 课程ID,course_name string comment 课程名称,tea_id string comment 讲师ID
)row format delimited fields terminated by ,lines terminated by \nstored as textfilelocation /quiz03/course;load data local inpath /root/data/data02/course.csv overwrite into table course;select * from course;-- 分数表 score.csv
create external table score (stu_id string comment 学生ID,course_id string comment 课程ID,score int comment 成绩
)row format delimited fields terminated by ,lines terminated by \nstored as textfilelocation /quiz03/score;load data local inpath /root/data/data02/score.csv overwrite into table score;
select * from score;-- 员工表 emp.csv
create external table emp (emp_id int comment 员工ID,emp_name string comment 员工姓名,emp_job string comment 员工岗位,emp_salary decimal(8,2) comment 员工薪资,dept_id int comment 员工隶属部门ID
)row format delimited fields terminated by ,lines terminated by \nstored as textfilelocation /quiz01/emp;
load data local inpath /root/data/data02/emp.csv overwrite into table emp;
select * from emp;-- 雇员表 employee.csv
create external table employee(name string comment 姓名,sex string comment 性别,birthday string comment 出生年月,hiredate string comment 入职日期,job string comment 岗位,salary int comment 薪资,bonus int comment 奖金,friends arraystring comment 朋友,children mapstring,int comment 孩子
)row format delimited fields terminated by ,collection items terminated by _map keys terminated by :lines terminated by \nstored as textfilelocation /quiz04/employee;
load data local inpath /root/data/data02/employee.csv into table employee;
select * from employee;-- 电影表 movie.txt
create external table movie(name string comment 电影名称,category string comment 电影分类
)row format delimited fields terminated by -lines terminated by \nstored as textfilelocation /quiz04/movie;
load data local inpath /root/data/data02/movie.txt into table movie;
select * from movie;-- 订单表 order.csv
create external table order
(order_id string comment 订单id,user_id string comment 用户id,user_name string comment 用户姓名,order_date string comment 下单日期,order_amount int comment 订单金额
)row format delimited fields terminated by ,lines terminated by \nstored as textfilelocation /quiz04/order;
load data local inpath /root/data/data02/order.csv into table order;
select * from order;行列转换
列转行
create table test (stu_name string,course_name string,score int
);
insert into test values (张三,语文,80),(张三,数学,90), (李四,语文,85),(李四,数学,95);
select * from test;select stu_name,max(case when course_name 语文 then score end) as yuwen,max(case when course_name 数学 then score end) as shuxuefrom test group by stu_name;-- 多个值转为集合 collect_list 不会去重
select collect_list(emp_job) job_list from emp;
-- 多个值转为集合 collect_set 会去重
select collect_set(emp_job) job_set from emp;
-- size 获取结合中元素的数量
select size(collect_set(emp_job)) job_count from emp;
-- concat_ws 将多个数据 以分隔符形式 拼接 concat_ws(分隔符,数据1,数据2,...)
select concat_ws(-,collect_set(emp_job)) job_string from emp;
-- split 字符串切分 以分隔符切分字符串 为集合
select split(concat_ws(-,collect_set(emp_job)),-) job_item from emp;行专列
create table sales (emp_name string,january int,february int,march int
);
insert into sales values (张三,1000,2000,3000),(李四,1500,2500,3500);
select * from sales;将转换后的结果还原 select t1.emp_name,sale_list[0] january,sale_list[1] february,sale_list[2] marchfrom(select t.emp_name,collect_list(sale) sale_list from(select emp_name,january yue, january sale from salesunion allselect emp_name,february yue,february sale from salesunion allselect emp_name,march yue,march sale from sales) tgroup by t.emp_name) t1;UDF UDTF UDAF UDF即用户定义函数(user-defined function)作用于单行数据并且产生一个数据行作为输出。 Hive中大多数函数都属于这一类比如数学函数和字符串函数。UDF函数的输入与输出值是1:1关系。 UDTF即用户定义表生成函数user-defined table-generating function 作用于单行数据并且产生多个数据行。UDTF函数的输入与输出值是1:n的关系。 UDAF用户定义聚集函数user-defined aggregate function作用于多行数据产生一个输出数据行。 Hive中像COUNT、MAX、MIN和SUM这样的函数就是聚集函数。UDAF函数的输入与输出值是n:1的关系。 explode array select explode(array(java,python,scala,go)) as course;map select explode(map(name,李昊哲,gender,1)) as (key,value);posexplode
select posexplode(array(java,python,scala,go)) as (pos,course);inline
select inline(array(named_struct(id,1,name,李昊哲,gender,1),named_struct(id,2,name,李哲,gender,0),named_struct(id,3,name,李大宝,gender,1)))as (id,name,gender);lateral view
select * from employee lateral view explode(friends) t as friend;select * from employee lateral view explode(children) t as children_name,children_age;select * from employeelateral view explode(friends) t1 as friendlateral view explode(children) t2 as children_name,children_age;select name, sex, birthday, hiredate, job, salary, bonus, friend,children_name,children_age from employee elateral view explode(friends) t1 as friendlateral view explode(children) t2 as children_name,children_age;UDTF 案例 根据电影信息表统计各分类的电影数量 select cate,count(name) as quantity from movielateral view explode(split(category,,)) tmp as categroup by cate;窗口函数(开窗函数) 能为每行数据划分一个窗口然后对窗口范围内的数据进行计算最后将计算结果返回给该行 Function(arg1,…, argn) OVER ([PARTITION BY …] [ORDER BY …] [window_expression]) 其中Function(arg1,…, argn) 可以是下面分类中的任意一个 聚合函数比如sum max min avg count等 分析函数比如lead lag first_value last_value等 排序函数比如row_number rank dense_rank等 OVER [PARTITION BY …] 类似于group by 用于指定分组 每个分组你可以把它叫做窗口 如果没有PARTITION BY 那么整张表的所有行就是一组 [ORDER BY …] 用于指定每个分组内的数据排序规则 支持ASC、DESC [window_expression] 用于指定每个窗口中 操作的数据范围 默认是窗口中所有行 聚合函数 聚合函数 rows 基于行 range 基于值 函数() over(rows between and 3) unbounded preceding 表示从前面的起点number preceding 往前current row 当前行number following 往后unbounded following 表示到后面的终点 统计每个用户截至每次下单的累计下单总额
select *,sum(order_amount) over (partition by user_id ,substr(order_date,1,7)order by order_daterows between unbounded preceding and current row) sum_order_amountfrom order;select *,sum(order_amount) over (partition by user_id ,substr(order_date,1,7)order by order_daterows unbounded preceding) sum_order_amountfrom order;统计每个用户截至每次下单的当月累积下单总额
select *,sum(order_amount) over (partition by user_id ,substr(order_date,1,7)order by order_daterows between unbounded preceding and unbounded following) sum_order_amountfrom order;最近三笔订单总金额 当前订单金额与前两笔订单金额的总和当前订单金额与后两笔订单金额的总和当前订单金额与前一笔订单和后一笔订单金额的总和 当前订单金额与前两笔订单金额的总和
select *,sum(order_amount) over (partition by user_idorder by order_daterows 2 preceding) sum_order_amountfrom order;当前订单金额与后两笔订单金额的总和
select *,sum(order_amount) over (partition by user_idorder by order_daterows 2 following) sum_order_amountfrom order;当前订单金额与前一笔订单和后一笔订单金额的总和
select *,sum(order_amount) over (partition by user_idorder by order_daterows between 1 preceding and 1 following) sum_order_amountfrom order;分析函数 lag lead first_value last_value
lag lead lag() over() 与 lead() over() 函数是跟偏移量相关的两个分析函数 通过这两个函数可以在一次查询中取出同一字段的前 N 行的数据 (lag) 和后 N 行的数据 (lead) 作为独立的列, 从而更方便地进行进行数据过滤。这种操作可以代替表的自联接并且 LAG 和 LEAD 有更高的效率。 over() 表示 lag() 与 lead() 操作的数据都在 over() 的范围内可以使用 partition by 语句用于分组 order by 语句用于排序。 partition by a order by b 表示以 a 字段进行分组再 以 b 字段进行排序对数据进行查询。 例如lag(field, num, defaultvalue) field 需要查找的字段num 往前查找的 num 行的数据defaultvalue 没有符合条件的默认值 例如lead(field, num, defaultvalue) field 需要查找的字段num 往后查找的 num 行的数据defaultvalue 没有符合条件的默认值 统计每个用户每次下单距离上次下单相隔的天数首次下单按0天算
select order_id, user_id, user_name, order_date, order_amount from (select order_id, user_id, user_name, order_date, order_amount,lag(order_date,1,order_date) over (partition by user_id order by order_date) pre_order_datefrom order) t where datediff(order_date,pre_order_date) 0;每个用户每个月首笔订单时间
select order_id, user_id, user_name, order_date, order_amount from (select order_id, user_id, user_name, order_date, order_amount,lag(order_date,1,order_date) over (partition by user_id,substr(order_date,1,7) order by order_date) pre_order_datefrom order) t where datediff(order_date,pre_order_date) 0;每个用户每个月最后笔订单时间
select order_id, user_id, user_name, order_date, order_amount from (select order_id, user_id, user_name, order_date, order_amount,lead(order_date,1,order_date) over (partition by user_id,substr(order_date,1,7) order by order_date) next_order_datefrom order) t where datediff(order_date,next_order_date) 0;每个岗位先先入职的远哥和后入在的员工工资差
select name, sex, birthday, hiredate, job, salary, bonus, friends, children, new_salary,(salary - new_salary) salary_diff from (select name, sex, birthday, hiredate, job, salary, bonus, friends, children,lead(salary,1,salary) over (partition by job order by hiredate) new_salaryfrom employee) t;first_value last_value first_value 取每个分区内某列的第一个值 语法first_value(col,true/false) over (partition by col1 order by col2) 第二个参数为true跳过空值默认为false last_value 取每个分区内某列的最后一个值 语法last_value(col,true/false) over (partition by col1 order by col2) 第二个参数为true跳过空值默认为false 每个用户每个月首笔订单时间
select order_id, user_id, user_name, order_date, order_amount,first_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_date) first_order_valuefrom order;每个用户每个月最后笔订单时间
select order_id, user_id, user_name, order_date, order_amount,last_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_daterows between current row and unbounded following) last_order_valuefrom order;每个用户每个月首笔订单时间和最后笔订单时间
select order_id, user_id, user_name, order_date, order_amount,first_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_date) first_order_value,last_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_daterows between current row and unbounded following) last_order_valuefrom order;select order_id, user_id, user_name, order_date, order_amount, first_order_value, last_order_value from(select order_id, user_id, user_name, order_date, order_amount,first_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_date) first_order_value,last_value(order_date) over (partition by user_id,substr(order_date,1,7) order by order_daterows between current row and unbounded following) last_order_valuefrom order) t where order_date first_order_value or order_date last_order_value;排序函数
分组排序取TopN 查询各科成绩前五名的学生 select a.course_id,a.stu_id,a.score from score aleft join score bon a.course_id b.course_id and a.score b.scoregroup by a.stu_id,a.course_id,a.scorehaving count(a.stu_id) 5order by a.course_id,a.score desc;select S1.course_id,s1.stu_id,s1.score from score s1 where(select count(*) from score s2where s2.course_ids1.course_id AND s2.score s1.score) 5 order by s1.course_id,s1.score desc;row_number row_number() over () 连续序号 over()里头的分组以及排序的执行晚于 where 、group by、order by 的执行。 select * from(select course_id, stu_id, score,row_number() over (partition by course_id order by score desc ) as mumfrom score) t where mum 5;rank rank() over () 排名 跳跃排序 序号不是连续的 select * from(select course_id, stu_id, score,rank() over (partition by course_id order by score desc ) as mumfrom score) t where mum 5;dense_rank dense_rank() over () 排名 连续排序 select * from(select course_id, stu_id, score,dense_rank() over (partition by course_id order by score desc ) as mumfrom score) t where mum 5;每个月每个消费总金额前三名的用户
select order_id, user_id, user_name, order_date, order_amount, total_order_amount, rank_total_order_amount from
(select order_id, user_id, user_name, order_date, order_amount, total_order_amount,dense_rank() over (partition by substr(order_date,1,7) order by total_order_amount desc) rank_total_order_amountfrom (select order_id, user_id, user_name, order_date, order_amount,sum(order_amount) over(partition by substr(order_date,1,7),user_id order by order_daterows between unbounded preceding and unbounded following) total_order_amountfrom order) t) t1 where rank_total_order_amount 3;