pandas 满足多条件的行的某列求和

import numpy as np
import pandas as pd
import csv
import random
# train_msg = pd.read_csv('../../txCTR/train_msg.csv')
# train_msg = train_msg.sort_values(by='user_id', ascending=True)  # 按指定列排序;sort_values 默认返回新的 DataFrame,如果不重新赋值(或不使用 inplace=True),原数据不会改变
# train_msg_train = train_msg[train_msg.user_id <= 100000]    # 按照单个条件选择行
# train_msg_test = train_msg[train_msg.user_id > 100000] 
# train_msg_train.to_csv('../../txCTR/train_msg_train.csv',index=False,sep=',')
# train_msg_test.to_csv('../../txCTR/train_msg_test.csv',index=False,sep=',')

# 下面一段代码:筛选同时满足多个条件的行,并对这些行的 click_times 列求和
# simulation = pd.read_csv('../../txCTR/test/try/simulation.csv')
# simulation = simulation[(simulation.user_id == 1) & (simulation.industry == 1) & (simulation.category == 2)]  # 每个条件外面的 () 不能丢:& 的优先级高于 ==,省略括号会导致表达式被错误解析
# print(simulation)
# times = simulation['click_times'].sum()  
# print(times)

# 下面这几行代码用 groupby 高效地统计各种 gender、age、industry 组合出现的次数,即样本在性别、年龄和 industry 上的分布
# train_msg = pd.read_csv('../../txCTR/train_msg.csv')
# # train_msg = train_msg.sort_values(by='user_id', ascending=True)
# count = train_msg.groupby(['gender', 'age', 'industry']).size().reset_index(name="Time")  # 此处如果不为统计列重命名,
# # 则保存的csv文件只有最后一列,且没有列标题
# print(count)
# count.to_csv("../../txCTR/statistics/try/industry/gender_age_industry_count.csv", index=False, sep=',')

 

你可能感兴趣的:(数据统计)