create table cite(citing int,cited int) row format deliminted fields terminated by ',' stored as textfile; load data inpath '/tmp/file1.txt' overwrite into table cite; select * from cite limit 10; create table cite_count(cited int ,count int); insert overwrite table cite_count select cited,count(citing) from cite group by cited; select * from cite_count limit 10; alter table cited_count rename to cite_count; create table cite_num(cited_count int,num int) clustered by (cited_count) sorted by (cited_count desc) into 10 buckets; insert overwrite table cite_num select count,count(cited) from cite_count group by count; # 引用次数最多的10个专利 select * from cite_count sort by count desc limit 10 #创建被引用专利唯一数据表 create table cited_unique(cited int) #写值 insert overwrite table cited_unique select distinct cited from cite; #查询有多少个专利被引用 select max(cited) from cited_unique #创建表 create table apat_one(patent int ,GYEAR String,GDATE String,APPYEAR String,COUNTRY String,POSTATE String,ASSIGNEE String,ASSCODE String,CLAIMS string,NCLASS String,CAT String,SUBCAT string,CMADE string,CRECEIVE string,RATIOCIT string,GENERAL string,ORIGINAL string,FWDAPLAG string,BCKGTLAG string,SELFCTUB string,SELFCTLB string,SECDUPBD string,SECDLWBD string) CLUSTERED BY(patent) SORTED BY(patent) INTO 32 BUCKETS row format delimited fields terminated by ',' STORED AS textfile; #加载数据 load data inpath '/patent/production/input/apat63_99.txt' overwrite into table apat; select * from apat limit 100; 问题:load data 时,怎么样做到自动分区和分桶? #统计每个国家的专利数 create table country_apat_count as select country ,count(1) num from apat group by country #国家的信息中有',使用函数去掉 create table country_apat_count as select regexp_replace(country,'\"','' ) ,count(1) num from apat group by regexp_replace(country,'\"','' ) alter table country_apat_count replace columns (country string,num bigint) # 修改表 alter table country_apat_count rename to country_apat_num; #查询前10条记录 select * from country_apat_count limit 10; set hive.exec.dynamic.partition = true; create table apat_one(patent int,GYEAR string, GDATE String,APPYEAR String,POSTATE String,ASSIGNEE String,ASSCODE String,CLAIMS string,NCLASS String,CAT String,SUBCAT string,CMADE string,CRECEIVE string,RATIOCIT string,GENERAL string,ORIGINAL string,FWDAPLAG string,BCKGTLAG string,SELFCTUB string,SELFCTLB string,SECDUPBD string,SECDLWBD string) partitioned by (COUNTRY String) CLUSTERED BY(patent) SORTED BY(patent) INTO 32 BUCKETS STORED AS textfile insert overwrite table apat_one partition (COUNTRY) select patent,GDATE,GYEAR,APPYEAR,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD,regexp_replace(COUNTRY,'\"','') from apat #partent,GYEAR,GDATE,APPYEAR,COUNTRY,POSTATE,ASSIGNEE,ASSCODE,CLAIMS,NCLASS,CAT,SUBCAT,CMADE,CRECEIVE,RATIOCIT,GENERAL,ORIGINAL,FWDAPLAG,BCKGTLAG,SELFCTUB,SELFCTLB,SECDUPBD,SECDLWBD #example #INSERT OVERWRITE TABLE T PARTITION (ds, hr) #SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10; # trim 函数的使用 select trim(' abc ') from country_apat_num limit 1; #regexp_replace 函数的使用,替换“ 成空 select regexp_replace(country,'\"','') from country_apat_num limit 5;