hive求日活与日新并脚本化运行

2019-03-18

hive实现日活跃用户统计
hive实现日新增用户统计

假如有一个web系统，每天生成以下日志文件：
需求：
建立一个表，来存储每天新增的数据(分区表)
1.统计每天的活跃用户(日活)（需要用户的ip，用户的账号，用户访问时间最早的一条url和时间）
2.统计每天的新增用户(日新)

--2017-09-15号的数据：
192.168.33.6,hunter,2017-09-15 10:30:20,/a
192.168.33.7,hunter,2017-09-15 10:30:26,/b
192.168.33.6,jack,2017-09-15 10:30:27,/a
192.168.33.8,tom,2017-09-15 10:30:28,/b
192.168.33.9,rose,2017-09-15 10:30:30,/b
192.168.33.10,julia,2017-09-15 10:30:40,/c
--2017-09-16号的数据：
192.168.33.16,hunter,2017-09-16 10:30:20,/a
192.168.33.18,jerry,2017-09-16 10:30:30,/b
192.168.33.26,jack,2017-09-16 10:30:40,/a
192.168.33.18,polo,2017-09-16 10:30:50,/b
192.168.33.39,nissan,2017-09-16 10:30:53,/b
192.168.33.39,nissan,2017-09-16 10:30:55,/a
192.168.33.39,nissan,2017-09-16 10:30:58,/c
192.168.33.20,ford,2017-09-16 10:30:54,/c
--2017-09-17号的数据：
192.168.33.46,hunter,2017-09-17 10:30:21,/a
192.168.43.18,jerry,2017-09-17 10:30:22,/b
192.168.43.26,tom,2017-09-17 10:30:23,/a
192.168.53.18,bmw,2017-09-17 10:30:24,/b
192.168.63.39,benz,2017-09-17 10:30:25,/b
192.168.33.25,haval,2017-09-17 10:30:30,/c
192.168.33.10,julia,2017-09-17 10:30:40,/c

实现日活

1.建表映射日志数据

craete table t_web_log(ip string,uid string,access_time string,url string)
partitioned by (day string)
row format delimited fields terminated by ',';

load data local inpath '/root/hive1/weblog/log.15' into table t_web_log partition(day='2017-09-15');
load data local inpath '/root/hive1/weblog/log.16' into table t_web_log partition(day='2017-09-16');
load data local inpath '/root/hive1/weblog/log.17' into table t_web_log partition(day='2017-09-17');

2. 统计日活跃用户

2.1创建日活表（分区表）

create table t_user_active_day(ip string,uid string,first_access string,url string)
partitioned by (day string);

2.2打标记 row_number()over最后通过where筛选只保留每个用户第一条即可

select ip,uid,access_time,url
from
(select ip,uid,access_time,url,
row_number() over(partition by uid order by access_time) as rn  --打标记 row_number()over最后通过where筛选只保留每个用户第一条即可
from t_web_log
where day='2017-09-15') tmp
where rn=1;  --where rn=1 只保留每个用户的一条数据;

+----------------+---------+----------------------+------+--+
|       ip       |   uid   |     access_time      | url  |
+----------------+---------+----------------------+------+--+
| 192.168.33.20  | ford    | 2017-09-16 10:30:54  | /c   |
| 192.168.33.16  | hunter  | 2017-09-16 10:30:20  | /a   |
| 192.168.33.26  | jack    | 2017-09-16 10:30:40  | /a   |
| 192.168.33.18  | jerry   | 2017-09-16 10:30:30  | /b   |
| 192.168.33.39  | nissan  | 2017-09-16 10:30:53  | /b   |
| 192.168.33.18  | polo    | 2017-09-16 10:30:50  | /b   |
+----------------+---------+----------------------+------+--+

2.3将2017-09-15插入到新建的日活跃表中去t_user_active_day,保存

insert into table t_user_active_day partition(day='2017-09-15')
select ip,uid,access_time,url
from
(select ip,uid,access_time,url,
row_number() over(partition by uid order by access_time) as rn 
from t_web_log
where day='2017-09-15') tmp
where rn=1;

2.4将2017-09-16插入到新建的日活跃表中去t_user_active_day,保存

insert into table t_user_active_day partition(day='2017-09-16')
select ip,uid,access_time,url
from
(select ip,uid,access_time,url,
row_number() over(partition by uid order by access_time) as rn 
from t_web_log
where day='2017-09-16') tmp
where rn=1;

2.5将2017-09-17插入到新建的日活跃表中去t_user_active_day,保存

insert into table t_user_active_day partition(day='2017-09-17')
select ip,uid,access_time,url
from
(select ip,uid,access_time,url,
row_number() over(partition by uid order by access_time) as rn 
from t_web_log
where day='2017-09-17') tmp
where rn=1;

3.查询结果

select * from t_user_active_day;
+--------------------+----------------------+-------------------------------+----------------------+----------------------+--+
|t_user_active_day.ip| t_user_active_day.uid| t_user_active_day.first_access| t_user_active_day.url| t_user_active_day.day|
+--------------------+----------------------+-------------------------------+----------------------+----------------------+--+
|192.168.33.6        | hunter               | 2017-09-15 10:30:20           | /a                   | 2017-09-15           |
|192.168.33.6        | jack                 | 2017-09-15 10:30:27           | /a                   | 2017-09-15           |
|192.168.33.10       | julia                | 2017-09-15 10:30:40           | /c                   | 2017-09-15           |
|192.168.33.9        | rose                 | 2017-09-15 10:30:30           | /b                   | 2017-09-15           |
|192.168.33.8        | tom                  | 2017-09-15 10:30:28           | /b                   | 2017-09-15           |
|192.168.33.20       | ford                 | 2017-09-16 10:30:54           | /c                   | 2017-09-16           |
|192.168.33.16       | hunter               | 2017-09-16 10:30:20           | /a                   | 2017-09-16           |
|192.168.33.26       | jack                 | 2017-09-16 10:30:40           | /a                   | 2017-09-16           |
|192.168.33.18       | jerry                | 2017-09-16 10:30:30           | /b                   | 2017-09-16           |
|192.168.33.39       | nissan               | 2017-09-16 10:30:53           | /b                   | 2017-09-16           |
|192.168.33.18       | polo                 | 2017-09-16 10:30:50           | /b                   | 2017-09-16           |
|192.168.63.39       | benz                 | 2017-09-17 10:30:25           | /b                   | 2017-09-17           |
|192.168.53.18       | bmw                  | 2017-09-17 10:30:24           | /b                   | 2017-09-17           |
|192.168.33.25       | haval                | 2017-09-17 10:30:30           | /c                   | 2017-09-17           |
|192.168.33.46       | hunter               | 2017-09-17 10:30:21           | /a                   | 2017-09-17           |
|192.168.43.18       | jerry                | 2017-09-17 10:30:22           | /b                   | 2017-09-17           |
|192.168.33.10       | julia                | 2017-09-17 10:30:40           | /c                   | 2017-09-17           |
|192.168.43.26       | tom                  | 2017-09-17 10:30:23           | /a                   | 2017-09-17           |
+--------------------+----------------------+-------------------------------+----------------------+----------------------+--+

实现日新

1.设计历史用户表

只保留用户id即可
create table t_user_history(uid string);

2.设计新用户表

create table t_user_new_day like t_user_active_day;

实现思想：
1.将当日活跃用户跟历史用户表关联，找出哪些在历史用户表尚不存在的用户
2.将关联的新用户表插入到当日新用户
3.将当日用户表插入当历史用户表中
4.完了还要将当日的用户表插入到历史用户表中去

步骤

1.当日活跃用户跟历史用户表关联

将当日活跃用户跟历史用户表关联，找出哪些在历史用户表尚不存在的用户
凡是连不上的 b.uid为空的为新用户

select a.ip,a.uid,a.first_access,a.url,b.uid
from t_user_active_day a
left join t_user_history b on a.uid=b.uid
where a.day='2017-09-15';
+----------------+---------+----------------------+--------+--------+--+
|      a.ip      |  a.uid  |    a.first_access    | a.url  | b.uid  |
+----------------+---------+----------------------+--------+--------+--+
| 192.168.33.6   | hunter  | 2017-09-15 10:30:20  | /a     | NULL   |
| 192.168.33.6   | jack    | 2017-09-15 10:30:27  | /a     | NULL   |
| 192.168.33.10  | julia   | 2017-09-15 10:30:40  | /c     | NULL   |
| 192.168.33.9   | rose    | 2017-09-15 10:30:30  | /b     | NULL   |
| 192.168.33.8   | tom     | 2017-09-15 10:30:28  | /b     | NULL   |
+----------------+---------+----------------------+--------+--------+--+

2.求出15号的新用户直接插入到日新用户表中（查询的结果因为历史用户都为空，所以与上面结果相同）

insert into table t_user_new_day partition(day='2017-09-15')            
select ip,uid,first_access,url
from
(
select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid
from t_user_active_day a
left join t_user_history b on a.uid=b.uid
where a.day='2017-09-15') tmp
where tmp.b_uid is null;

3.将15号的新用户插入历史表

insert into table t_user_history
select uid
from t_user_new_day
where day='2017-09-15';

4.求出16号的新用户

insert into table t_user_new_day partition(day='2017-09-16')            
select ip,uid,first_access,url
from
(
select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid
from t_user_active_day a
left join t_user_history b on a.uid=b.uid
where a.day='2017-09-16') tmp
where tmp.b_uid is null;

5.将16号的新用户插入历史表

insert into table t_user_history
select uid
from t_user_new_day
where day='2017-09-16';

6.求出17号的新用户

insert into table t_user_new_day partition(day='2017-09-17')            
select ip,uid,first_access,url
from
(
select a.ip,a.uid,a.first_access,a.url,b.uid as b_uid
from t_user_active_day a
left join t_user_history b on a.uid=b.uid
where a.day='2017-09-17') tmp
where tmp.b_uid is null;

7.将17号的新用户插入历史表

insert into table t_user_history
select uid
from t_user_new_day
where day='2017-09-17';

查询历史表

select * from t_user_history
+---------------------+--+
| t_user_history.uid  |
+---------------------+--+
| hunter              |
| jack                |
| julia               |
| rose                |
| tom                 |
| ford                |
| jerry               |
| nissan              |
| polo                |
| benz                |
| bmw                 |
| haval               |
+---------------------+--+

查询日新表

select * from t_user_new_day;
+--------------------+---------------------+------------------------------+---------------------+---------------------+--+
| t_user_new_day.ip  | t_user_new_day.uid  | t_user_new_day.first_access  | t_user_new_day.url  | t_user_new_day.day  |
+--------------------+---------------------+------------------------------+---------------------+---------------------+--+
| 192.168.33.6       | hunter              | 2017-09-15 10:30:20          | /a                  | 2017-09-15          |
| 192.168.33.6       | jack                | 2017-09-15 10:30:27          | /a                  | 2017-09-15          |
| 192.168.33.10      | julia               | 2017-09-15 10:30:40          | /c                  | 2017-09-15          |
| 192.168.33.9       | rose                | 2017-09-15 10:30:30          | /b                  | 2017-09-15          |
| 192.168.33.8       | tom                 | 2017-09-15 10:30:28          | /b                  | 2017-09-15          |
| 192.168.33.20      | ford                | 2017-09-16 10:30:54          | /c                  | 2017-09-16          |
| 192.168.33.18      | jerry               | 2017-09-16 10:30:30          | /b                  | 2017-09-16          |
| 192.168.33.39      | nissan              | 2017-09-16 10:30:53          | /b                  | 2017-09-16          |
| 192.168.33.18      | polo                | 2017-09-16 10:30:50          | /b                  | 2017-09-16          |
| 192.168.63.39      | benz                | 2017-09-17 10:30:25          | /b                  | 2017-09-17          |
| 192.168.53.18      | bmw                 | 2017-09-17 10:30:24          | /b                  | 2017-09-17          |
| 192.168.33.25      | haval               | 2017-09-17 10:30:30          | /c                  | 2017-09-17          |
+--------------------+---------------------+------------------------------+---------------------+---------------------+--+

脚本化实现日活用户脚本化入库ETL

模拟拿到数据后的第二天凌晨处理

#!/bin/bash
#自动化脚本应该设置为动态时间
#第一步先将系统时间修改为数据处理时间,重启后自动恢复 date -s '2017-09-15'
#设置时间变量并解析
#day_str="2017-09-15"
#echo "准备处理$day_str 的数据..."
#在数据库weblog创建了已经t_user_active_day_shell表

#date -s '2017-09-16'  #模拟拿到数据后的第二天凌晨处理
date -s $1 #模拟传进来时间
day_str=`date -d '-1 day' +'%Y-%m-%d'`  #前一天的数据
echo "准备处理$day_str 的数据.."

HQL_user_active_day="

insert into table weblog.t_user_active_day_shell partition(day=\"$day_str\")
select ip,uid,access_time,url
from
(select ip,uid,access_time,url,
row_number() over(partition by uid order by access_time) as rn 
from weblog.t_web_log
where day=\"$day_str\") tmp
where rn=1;  
"
echo "executing sql..."
echo $HQL_user_active_day
hive -e "$HQL_user_active_day"

执行shell

模拟拿到数据后的第二天凌晨处理
./elt_weblog.sh '2017-09-16'
./elt_weblog.sh '2017-09-17'
./elt_weblog.sh '2017-09-18'