hive 数据清洗大致流程

-- 创建 Hive mock_data 管理表

-- Managed (non-external) table that receives the raw mock records.
-- Every column is declared as a string and fields are tab-delimited.
-- Note: the column name `eduction` is spelled this way in every later
-- statement of this script, so it is intentionally left unchanged here.
create table mock_data (
    uuid         string,
    name         string,
    englishName  string,
    gender       string,
    birth        string,
    identityNum  string,
    nationality  string,
    eduction     string,
    occupation   string,
    title        string,
    phoneNumber  string,
    address      string,
    email        string,
    website      string,
    imsi         string,
    imei         string,
    ip           string,
    macAddress   string
)
row format delimited
    fields terminated by '\t';

-- 导入数据到表中

-- Load the file from the local filesystem (`local`, i.e. not an HDFS path)
-- into mock_data. Plain `into` (no `overwrite`) appends to existing data.
load data local inpath '/home/data.txt'
    into table mock_data;

-- 创建相同表结构表存放去重数据

-- Empty table with the same schema as mock_data. Despite the name
-- "duplicated", it is the target for the de-duplicated rows.
create table mock_data_duplicated
    like mock_data;

-- 统计去重的数据量

-- Count the rows that remain when all columns are compared for equality,
-- i.e. the row count to expect after de-duplication.
select count(*)
from (
    select distinct
        uuid,
        name,
        englishName,
        gender,
        birth,
        identityNum,
        nationality,
        eduction,
        occupation,
        title,
        phoneNumber,
        address,
        email,
        website,
        imsi,
        imei,
        ip,
        macAddress
    from mock_data
) distinct_rows;

-- 将原数据集去重后导入去重表 mock_data_duplicated 中

-- Copy each distinct row (compared across all columns) from mock_data into
-- mock_data_duplicated. `insert into` appends, so running this statement a
-- second time would add the same rows again.
insert into table mock_data_duplicated (
    uuid,
    name,
    englishname,
    gender,
    birth,
    identitynum,
    nationality,
    eduction,
    occupation,
    title,
    phonenumber,
    address,
    email,
    website,
    imsi,
    imei,
    ip,
    macaddress
)
select distinct
    uuid,
    name,
    englishName,
    gender,
    birth,
    identityNum,
    nationality,
    eduction,
    occupation,
    title,
    phoneNumber,
    address,
    email,
    website,
    imsi,
    imei,
    ip,
    macAddress
from mock_data;

-- 查询带有无效邮箱的数据

-- Preview the rows whose email is missing or fails the format check, with
-- the email column shown blanked out (empty string).
--
-- Digits are written as 0-9 inside the character classes. A class such as
-- [A-Za-zd] matches the letter "d", not digits, and would report every
-- address containing a digit as invalid. Spelling the range out also avoids
-- backslash escaping inside the Hive string literal.
--
-- `email is null` is tested explicitly because `not regexp` on NULL yields
-- NULL, which would silently drop those rows from the result.
select
    uuid,
    name,
    englishname,
    gender,
    birth,
    identitynum,
    nationality,
    eduction,
    occupation,
    title,
    phonenumber,
    address,
    '' as email,
    website,
    imsi,
    imei,
    ip,
    macaddress
from mock_data_duplicated
where email is null
   or email not regexp '^[A-Za-z0-9]+([-_.][A-Za-z0-9]+)*@([A-Za-z0-9]+[-.])+[A-Za-z0-9]{2,5}$';

-- Build mock_data2 from the de-duplicated table in a single pass:
--   * rows whose email passes the format check are copied unchanged;
--   * rows whose email is missing (NULL) or malformed are kept, with the
--     email replaced by an empty string.
--
-- One `case` expression replaces a "create table as select <valid rows>"
-- followed by "insert <invalid rows>". Keeping the pattern in one place
-- means the valid and invalid branches cannot drift apart, and NULL emails
-- (for which both `regexp` and `not regexp` evaluate to NULL) fall into the
-- `else` branch instead of being lost.
--
-- Digits are written as 0-9 inside the character classes. A class such as
-- [A-Za-zd] matches the letter "d", not digits, and would treat every
-- address containing a digit as invalid. Spelling the range out also avoids
-- backslash escaping inside the Hive string literal.
create table mock_data2 as
select
    uuid,
    name,
    englishname,
    gender,
    birth,
    identitynum,
    nationality,
    eduction,
    occupation,
    title,
    phonenumber,
    address,
    case
        when email regexp '^[A-Za-z0-9]+([-_.][A-Za-z0-9]+)*@([A-Za-z0-9]+[-.])+[A-Za-z0-9]{2,5}$'
            then email
        else ''
    end as email,
    website,
    imsi,
    imei,
    ip,
    macaddress
from mock_data_duplicated;

你可能感兴趣的:(数据清洗,数据去重,数据治理,sql,hive)