# groupby(...).head(1) returns the first row of each group in its current order,
# which is not necessarily the row with the smallest ts:
# grouped=data.groupby(['t1.date','t1.imei','address']).head(1) # 43626 rows
# Instead, keep the row(s) whose ts equals the minimum ts of each (date, imei) group.
grouped = data.groupby(['t1.date', 't1.imei']).apply(
    lambda grp: grp.loc[grp['ts'].eq(grp['ts'].min())]
)
grouped.head()
# Count how often each address occurs per (date, imei).
# The placeholder column only gives count() a non-null column to tally.
data['counts'] = 0
data = data.groupby(
    ['t1.date', 't1.imei', 'address'], as_index=False
)['counts'].count()
# Keep the row(s) with the highest frequency in each (date, imei) group;
# ties on the maximum all survive.
grouped = data.groupby(['t1.date', 't1.imei']).apply(
    lambda grp: grp.loc[grp['counts'].eq(grp['counts'].max())]
)
Python
# Smallest `time` per imei, flattened into a two-column frame.
grouped = data['time'].groupby(data['imei'])
# Important: to_frame() turns the per-imei Series into a DataFrame, and
# reset_index() moves imei out of the index into an ordinary column.
mindf = grouped.min().to_frame().reset_index()
mindf.head()
SQL - row_number()
-- Active devices: rows whose last-active day (first 10 characters of
-- lastactivetime read as a unix timestamp) equals the row's date,
-- restricted to 20190702..20190706 and finalCountry = "中国".
-- First attempt; the note that follows this query discusses its grouping.
select
    date,
    id,
    row_number() over (partition by id order by date asc) as rank
from profile.device_state_accumulator_all
where date >= 20190702
  and date <= 20190706
  and from_unixtime(cast(substr(lastactivetime, 1, 10) as int), 'yyyyMMdd') = date
  and finalCountry = "中国"
group by
    date,
    id
这样就写错了:
根据date,id进行分组之后,rank肯定都=1了。
所以如果想得到真正的最早date对应的id,应该只对id进行分组。
改正
-- For each date, count the lbs0701 devices whose earliest active day within
-- 20190702..20190706 is that date.
select
    t2.date,
    count(distinct t2.id) as counts
from (
    select imei
    from miui_data.lbs0701
) t1
inner join (
    -- Active devices: rows whose last-active day equals the row's date.
    -- Important: the per-id grouping is done by PARTITION BY id alone.
    -- There is deliberately no GROUP BY here: selecting the non-aggregated
    -- `date` under `group by id` is rejected by engines that enforce standard
    -- GROUP BY rules, and permissive engines would return an arbitrary date
    -- per id instead of the earliest one.
    select
        date,
        id,
        row_number() over (partition by id order by date asc) as rank
    from profile.device_state_accumulator_all
    where date >= 20190702
      and date <= 20190706
      and from_unixtime(cast(substr(lastactivetime, 1, 10) as int), 'yyyyMMdd') = date
      and finalCountry = "中国"
) t2
    on t1.imei = t2.id
-- rank = 1 keeps exactly one row per id: the one with the smallest date.
where t2.rank = 1
group by t2.date
或者用min():
-- For each earliest-active date, count the lbs0701 devices that have it.
select
    t2.min_date,
    count(distinct t2.id) as counts
from (
    select imei
    from miui_data.lbs0701
) t1
inner join (
    -- Active devices: rows whose last-active day equals the row's date.
    -- Key point: min(date) with GROUP BY id yields one earliest date per id.
    select
        min(date) as min_date,
        id
    from profile.device_state_accumulator_all
    where date >= 20190702
      and date <= 20190706
      and from_unixtime(cast(substr(lastactivetime, 1, 10) as int), 'yyyyMMdd') = date
      and finalCountry = "中国"
    group by id
) t2
    on t1.imei = t2.id
group by t2.min_date
# Create a new placeholder column (the aggregation below does not read it).
china['sum'] = 0
# Sum the existing `counts` values per date.
# NOTE(review): the original note says the total is stored in the `sum` column,
# but this expression returns the summed `counts` as a Series indexed by date
# and rebinds `china` to it -- confirm which shape is wanted.
china = china.groupby(['date'])['counts'].sum()
# Keep one row per (imei, model, real_model, pn, firstactivedate): after an
# ascending sort, head(1) picks the row with the smallest lateractivedate.
dedup_keys = ['imei', 'model', 'real_model', 'pn', 'firstactivedate']
right_model_unique = (
    right_model
    .sort_values(dedup_keys + ['lateractivedate'], ascending=True)
    .groupby(dedup_keys)
    .head(1)
)