Create two tables and load local data
create table t_a(name string, num int)
row format delimited
fields terminated by ',';
create table t_b(name string, nick string)
row format delimited
fields terminated by ',';
Load the data
load data local inpath '/root/hive1/a.txt' into table t_a;
load data local inpath '/root/hive1/b.txt' into table t_b;
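If you reload the same files and want to replace the table contents instead of appending to them, Hive's load statement also accepts an overwrite clause. A sketch, reusing the same paths:
load data local inpath '/root/hive1/a.txt' overwrite into table t_a;
load data local inpath '/root/hive1/b.txt' overwrite into table t_b;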
Local data
a.txt:
a,1
b,2
c,3
d,4
b.txt:
a,xx
b,yy
d,zz
e,pp
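To confirm the loads, a quick query of each table should return the rows above:
select * from t_a;
select * from t_b;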
Join types
1. Inner join (inner join)
A join with no ON condition produces the Cartesian product (every possible pairing of rows):
select a.*,b.* from t_a a join t_b b;
Result:
a.name a.num b.name b.nick
a 1 a xx
b 2 a xx
c 3 a xx
d 4 a xx
a 1 b yy
b 2 b yy
c 3 b yy
d 4 b yy
a 1 d zz
b 2 d zz
c 3 d zz
d 4 d zz
a 1 e pp
b 2 e pp
c 3 e pp
d 4 e pp
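The same Cartesian product can also be written with an explicit cross join; a sketch of the equivalent syntax:
select a.*,b.* from t_a a cross join t_b b;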
2. Specifying a join condition
select a.*,b.* from t_a a join t_b b on a.name=b.name;
Result:
a.name a.num b.name b.nick
a 1 a xx
b 2 b yy
d 4 d zz
3. Left outer join (left join)
All rows from the left table are kept; unmatched right-table columns are filled with NULL.
select a.*,b.* from t_a a left outer join t_b b on a.name=b.name;
Result:
a.name a.num b.name b.nick
a 1 a xx
b 2 b yy
c 3 NULL NULL
d 4 d zz
4. Right outer join (right join)
All rows from the right table are kept; unmatched left-table columns are filled with NULL.
select a.*,b.* from t_a a right outer join t_b b on a.name=b.name;
Result:
a.name a.num b.name b.nick
a 1 a xx
b 2 b yy
d 4 d zz
NULL NULL e pp
5. Full outer join
select a.*,b.* from t_a a full outer join t_b b on a.name=b.name;
Result:
a.name a.num b.name b.nick
a 1 a xx
b 2 b yy
c 3 NULL NULL
d 4 d zz
NULL NULL e pp
6. Left semi join (left semi, Hive-specific)
In a left semi join, filter conditions on the right table can only be set in the ON clause;
they cannot be placed in the WHERE clause, the SELECT clause, or anywhere else.
select a.* from t_a a left semi join t_b b on a.name=b.name;
Note: the SELECT clause of a left semi join cannot reference columns of the right table.
This query returns the rows of t_a whose name also exists in t_b.
Result:
a.name a.num
a 1
b 2
d 4
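On Hive versions that support IN subqueries in the WHERE clause, the same result can be obtained without left semi join. A sketch of the equivalent query:
select a.* from t_a a where a.name in (select name from t_b);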
upper function (converts lowercase to uppercase)
select upper('abc');
It can also be applied to a table column; the expression is evaluated for every row of the data:
0: jdbc:hive2://master:10000> select ip,upper(url),access_time from t_log;
+----------------+-----------------------------+----------------------+--+
| ip | _c1 | access_time |
+----------------+-----------------------------+----------------------+--+
| 192.168.33.3 | HTTP://WWW.APARKE.CN/STU | 2019-03-04 15:30:20 |
| 192.168.33.3 | HTTP://WWW.APARKE.CN/TEACH | 2019-03-04 15:35:20 |
| 192.168.33.4 | HTTP://WWW.APARKE.CN/STU | 2019-03-04 15:30:20 |
| 192.168.33.4 | HTTP://WWW.APARKE.CN/JOB | 2019-03-04 16:30:20 |
| 192.168.33.5 | HTTP://WWW.APARKE.CN/JOB | 2019-03-04 15:40:20 |
| 192.168.33.3 | HTTP://WWW.APARKE.CN/STU | 2019-03-05 15:30:20 |
| 192.168.44.3 | HTTP://WWW.APARKE.CN/TEACH | 2019-03-05 15:35:20 |
| 192.168.33.44 | HTTP://WWW.APARKE.CN/STU | 2019-03-05 15:30:20 |
| 192.168.33.46 | HTTP://WWW.APARKE.CN/JOB | 2019-03-05 16:30:20 |
| 192.168.33.55 | HTTP://WWW.APARKE.CN/JOB | 2019-03-05 15:40:20 |
+----------------+-----------------------------+----------------------+--+
10 rows selected (0.165 seconds)
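The _c1 header above comes from the unaliased expression; giving upper(url) an alias (url_upper here is just an illustrative name) makes the output readable:
select ip, upper(url) as url_upper, access_time from t_log;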
Count the number of visits per URL (in HiveQL, -- starts a comment):
select url,count(1) from t_log group by url;
-- This expression is evaluated once per group, after the rows have been grouped.
You can give the computed count a column name with AS:
select url,count(1) as numbers from t_log group by url;
For each URL, find the largest visitor IP (ip is a string, so max() compares lexicographically):
select url,max(ip) from t_log group by url;
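A related per-group aggregation: counting distinct visitors for each URL (the visitors alias is just illustrative):
select url, count(distinct ip) as visitors from t_log group by url;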
For each user, find the latest access time for each page visited:
select ip,url,max(access_time) from t_log group by ip,url;
Grouping and aggregation: a fuller example
Data (one sample line from the log files):
192.168.33.3,http://www.aparke.cn/stu,2019-03-04 15:30:20
-- Create a table to map the data above
create table access_time(ip string,url string,access_time string)
partitioned by (dt string)
row format delimited fields terminated by ',';
-- Load the local data files into the matching partitions
load data local inpath '/root/hive1/groupby/log.0304' into table access_time partition (dt='2019-03-04');
load data local inpath '/root/hive1/groupby/log.0305' into table access_time partition (dt='2019-03-05');
load data local inpath '/root/hive1/groupby/log.0306' into table access_time partition (dt='2019-03-06');
-- Check the table's partitions
show partitions access_time;
dt=2019-03-04
dt=2019-03-05
dt=2019-03-06
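Because dt is a partition column, filtering on it lets Hive read only the files of the matching partition instead of scanning the whole table, for example:
select * from access_time where dt='2019-03-04';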
-- Query: after March 4, the total number of visits to "http://www.aparke.cn/job" and the largest visitor IP
The first attempt below is wrong: url appears in the SELECT list but is neither grouped nor aggregated. Alternatively, you can wrap url in an aggregate function; since the WHERE clause already restricts url to a single value, using min or max does not change the logic (the fourth and fifth approaches do exactly that).
select dt ,url,count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job'
group by dt having dt >'2019-03-04';
Corrected:
select dt ,url,count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job'
group by dt,url having dt >'2019-03-04';
Second approach:
select dt ,url ,count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job'
group by dt,url having dt >'2019-03-04'; -- having filters the grouped results after GROUP BY
Third approach (filter while preparing the data):
select dt ,url ,count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job' and dt>'2019-03-04' -- filter while preparing the data
group by dt,url; -- GROUP BY then groups the rows that passed the WHERE filter
Without grouping by url (GROUP BY dt only): since WHERE already fixes the url, you can select the literal string instead:
select dt ,'http://www.aparke.cn/job' ,count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job'
group by dt having dt >'2019-03-04';
Fourth approach:
select dt,max(url),count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job'
group by dt having dt >'2019-03-04';
Fifth approach:
select dt,min(url),count(1),max(ip)
from access_time
where url='http://www.aparke.cn/job'
group by dt having dt >'2019-03-04';
All of these return:
+-------------+---------------------------+------+----------------+--+
| dt | _c1 | _c2 | _c3 |
+-------------+---------------------------+------+----------------+--+
| 2019-03-05 | http://www.aparke.cn/job | 2 | 192.168.33.55 |
| 2019-03-06 | http://www.aparke.cn/job | 3 | 192.168.33.55 |
+-------------+---------------------------+------+----------------+--+
Subqueries
-- For each day after March 4, the maximum visitor IP and visit count of each page
select dt,url,max(ip),count(1)
from access_time
where dt > '2019-03-04'
group by dt,url;
-- For each day after March 4, the visit count and maximum visitor IP of each page, keeping only records with a total visit count > 2
select dt,url,max(ip),count(1) as numbers
from access_time
where dt >'2019-03-04'
group by dt,url
having numbers>2;
-- Using a subquery (the query in parentheses is treated as a table); this rewrites the previous query, moving the HAVING filter to the outer WHERE
select dt,url,numbers,max_ip from
(
  select dt,url,count(1) as numbers,max(ip) as max_ip
  from access_time
  where dt >'2019-03-04'
  group by dt,url
) tmp
where numbers>2;
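On Hive versions that support the WITH clause, the same subquery can also be written as a common table expression (the tmp name is just an illustrative alias):
with tmp as (
  select dt,url,count(1) as numbers,max(ip) as max_ip
  from access_time
  where dt >'2019-03-04'
  group by dt,url
)
select dt,url,numbers,max_ip
from tmp
where numbers>2;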