python+SQL group by后排序取最大值/最小值

取根据时间最小值的行

# Keep every row whose ts equals the minimum ts of its (t1.date, t1.imei) group.
# groupby([...]).head(1) is not a substitute: it returns the first row of each
# group in the current row order, not the row with the smallest ts (an earlier
# attempt grouped by t1.date, t1.imei, address that way and returned 43626 rows).
# transform('min') broadcasts each group's minimum back onto its own rows, so one
# vectorised comparison replaces a Python lambda executed once per group; the
# result keeps data's original index and all of its columns. Ties are all kept.
group_min_ts = data.groupby(['t1.date', 't1.imei'])['ts'].transform('min')
grouped = data[data['ts'] == group_min_ts]
grouped.head()

取出现频次最多的行

# Count how many rows each (t1.date, t1.imei, address) combination has.
# size() counts rows directly, so no placeholder column has to be written
# into data before grouping.
data = (
    data.groupby(['t1.date', 't1.imei', 'address'])
    .size()
    .reset_index(name='counts')
)
# For each (t1.date, t1.imei), keep the address row(s) with the highest count.
# transform('max') aligns each group's maximum with its rows, which avoids a
# per-group Python lambda; rows tied for the maximum are all kept.
max_counts = data.groupby(['t1.date', 't1.imei'])['counts'].transform('max')
grouped = data[data['counts'] == max_counts]

只取出现最早的数据

Python

# Earliest time per imei, as a flat frame with the columns imei and time.
grouped = data.groupby('imei')['time']
# Important: reset_index() turns the imei index of the per-group minimum
# back into an ordinary column, giving a DataFrame instead of a Series.
mindf = grouped.min().reset_index()
mindf.head()

SQL - row_number()

-- Active devices: one output row per (date, id) group, with a row_number()
-- partitioned by id and ordered by date ascending.
select date,id,row_number() over(PARTITION BY id ORDER BY date asc) as rank
from profile.device_state_accumulator_all
where date >= 20190702
and date <= 20190706
-- keep rows whose lastactivetime (first 10 characters, cast to int and read as a unix timestamp) formats to the same yyyyMMdd value as date
and from_unixtime(cast(substr(lastactivetime,1,10) as int),'yyyyMMdd') = date
and finalCountry="中国"
group by date,id

这样就写错了:
根据date,id进行分组之后,rank肯定都=1了。
所以如果想得到真正的最早date对应的id,应该只对id进行分组。

改正

-- For each date, count the lbs0701 devices whose earliest date in the filtered
-- window is that date.
select t2.date, count(distinct t2.id) as counts
from (select imei from miui_data.lbs0701) t1
inner join
(
    -- Active devices.
    -- Key point: partition by id only and do NOT add a GROUP BY here.
    -- row_number() then numbers each id's rows from its earliest date onwards,
    -- so exactly one row per id gets rn = 1. Selecting date while grouping only
    -- by id is rejected by engines that enforce standard GROUP BY rules (Hive
    -- included), because date is neither grouped nor aggregated.
    -- The alias is rn rather than rank to avoid clashing with the rank() function name.
    select date, id,
           row_number() over (partition by id order by date asc) as rn
    from profile.device_state_accumulator_all
    where date >= 20190702
      and date <= 20190706
      and from_unixtime(cast(substr(lastactivetime,1,10) as int),'yyyyMMdd') = date
      and finalCountry="中国"
) t2
on t1.imei = t2.id
where t2.rn = 1
group by t2.date

或者用min():

	-- For each earliest date, count the lbs0701 devices that first appear on it.
select t2.min_date,
       count(distinct t2.id) as counts
from (select imei from miui_data.lbs0701) t1
inner join (
    -- Active devices. Key point: min(date) with group by id collapses each id
    -- to a single row holding its earliest date inside the filtered window.
    select id,
           min(date) as min_date
    from profile.device_state_accumulator_all
    where date between 20190702 and 20190706
      and from_unixtime(cast(substr(lastactivetime,1,10) as int),'yyyyMMdd') = date
      and finalCountry="中国"
    group by id
) t2
  on t1.imei = t2.id
group by t2.min_date

Python - 分组后求和

# Sum counts per date. The result is a Series indexed by date.
# String literals must use straight quotes; typographic (curly) quotes are a
# SyntaxError in Python. No 'sum' column is created beforehand: the groupby
# result replaces china entirely, so such a column would never be used.
# Append .reset_index(name='sum') if a DataFrame with a 'sum' column is wanted.
china = china.groupby(['date'])['counts'].sum()

python分组后去重 - 只留下某个最小的值

  1. 先排序,再groupby分组,分组后只保留第一个
# Sort ascending on every key plus lateractivedate, then keep the first row of
# each (imei, model, real_model, pn, firstactivedate) group, i.e. the row that
# sorts first on lateractivedate within the group.
sort_columns = ['imei', 'model', 'real_model', 'pn', 'firstactivedate', 'lateractivedate']
dedup_keys = sort_columns[:-1]
right_model_unique = right_model.sort_values(sort_columns, ascending=True)
right_model_unique = right_model_unique.groupby(dedup_keys).head(1)

你可能感兴趣的:(python,hive,sql)