pandas 分段读取csv
# Read a large CSV piece by piece. iterator=True makes read_csv return a
# reader object instead of a DataFrame; each get_chunk(n) call returns the
# next n rows. The 'id' column is converted with str so it stays text.
csv_data = pd.read_csv("./train.csv", converters={'id': str}, iterator=True)
loop = True
while loop:
    try:
        chunk = csv_data.get_chunk(1000000)
    except StopIteration:
        # The reader is exhausted: leave the loop on the next check.
        loop = False
        print("Iteration is stopped.")
    else:
        # Per-chunk processing goes here (left empty in these notes).
        pass
# Append df to train.csv: header=False skips the header row, mode='a'
# appends to the existing file, index=False skips the index column.
df.to_csv('train.csv', header=False, mode='a', index=False)

# Read 'id' as text so that it is not interpreted as scientific notation.
csv_data = pd.read_csv("./train_pos.csv", converters={'id': str})

# Read a tab-separated txt file that has no header row, naming the columns.
csv_data = pd.read_table(
    'train_0.txt',
    header=None,
    sep='\t',
    names=['user_id', 'item_id', 'rating'],
)

# Keep only the rows whose click is 1. The copy makes the column assignment
# below act on an independent frame rather than on a slice of `chunk`.
df = chunk[chunk.click.isin([1])].copy()

# Parse the date column (to_datetime is a pandas function, not a DataFrame
# method).
df['date'] = pd.to_datetime(df['date'])

# Sort by date.
df_sort = df.sort_values('date')

# Replace every NaN in the table with the string '-1'.
df.fillna('-1', inplace=True)

# Count how many entries of col_name equal '-1'; evaluates to 0 when there
# are none instead of raising KeyError.
(df[col_name] == '-1').sum()

# Get the column names.
df.columns.values

# Save as csv without header row and without index column.
df.to_csv('df.csv', header=False, index=False)

# Count how often each value occurs in the column.
df[col_name].value_counts()

# Join the two tables.
pd_ = pd.merge(review_pos_sort, user)
重新编号
def renumber(csv_data, col_list):
    """Replace the values of the given columns with dense integer codes.

    For every column named in ``col_list`` each distinct value is replaced
    by an integer code. Codes start at 0 and are handed out in order of
    first appearance within that column.

    Args:
        csv_data: DataFrame to renumber; it is modified in place.
        col_list: Iterable of column names to renumber.

    Returns:
        None. The result is written back into ``csv_data``.

    Note:
        Missing values (NaN/None) receive the code -1, which is the
        default sentinel of ``Series.factorize``.
    """
    for col in col_list:
        # factorize numbers the distinct values by first occurrence.
        codes, _ = csv_data[col].factorize()
        csv_data[col] = codes
将特征出现次数少于20的特征赋值为-1
def deal_feature(col_list, csv_data, min_count=20):
    """Replace rare values of the given columns with -1.

    For every column named in ``col_list`` a value is kept when it occurs
    at least ``min_count`` times in that column; every other value is
    replaced by the integer -1. The name of each column is printed once it
    has been processed.

    Args:
        col_list: Iterable of column names to process.
        csv_data: DataFrame to process; it is modified in place.
        min_count: Minimum number of occurrences a value needs in order to
            be kept. Defaults to 20.

    Returns:
        None. The result is written back into ``csv_data``.

    Note:
        Missing values (NaN/None) are not counted by ``value_counts`` and
        are therefore replaced by -1 as well.
    """
    for col in col_list:
        column = csv_data[col]
        counts = column.value_counts()
        frequent = set(counts.index[counts >= min_count])
        csv_data[col] = column.map(lambda v: v if v in frequent else -1)
        print(col)
输出每列的最值
# Print the largest and smallest value of each listed column of `business`
# as "<column>[<max>][<min>]". The %d conversions require numeric values.
lis = ['stars', 'postal_code', 'state', 'city', 'business_id']
for i in lis:
    column = business[i]
    largest = column.max()
    smallest = column.min()
    print('%s[%d][%d]' % (i, largest, smallest))
将所有的user,item统一编号
# Shift the ids of the user columns so that they share one id space: every
# column after the first is offset by the summed sizes of the columns that
# precede it.
col_list_user = ['device_id', 'device_ip']
col_list_item = ['C1', 'banner_pos', 'site_id']
# NOTE(review): presumably the number of distinct ids per user / item
# column, in column order -- confirm against the data. The item lists are
# not used by the loop below.
user_len = [903, 34984, 3785, 5, 4, 1649]
item_len = [7, 7, 1591, 1453, 19]
total = 0
# zip stops at the shorter sequence, so a size list that is longer than the
# column list can no longer index past the end of col_list_user.
for size, col_name in zip(user_len, col_list_user[1:]):
    total += size
    train_save[col_name] = train_save[col_name] + total
    test_save[col_name] = test_save[col_name] + total
将count按数量级分桶,映射为0-4
# Bucket the count columns of `user` by magnitude:
#   <= 9 -> 0, <= 100 -> 1, <= 1000 -> 2, <= 10000 -> 3, larger -> 4.
# pd.cut uses right-closed intervals, so each edge value falls into the
# lower bucket. The open-ended outer edges mean that no count, however
# large, is left without a bucket.
bucket_edges = [float('-inf'), 9, 100, 1000, 10000, float('inf')]
col = ['review_count', 'useful', 'funny', 'cool', 'fans']
for i in col:
    # labels=False yields the integer bucket number instead of an interval.
    user[i] = pd.cut(user[i], bins=bucket_edges, labels=False)
删除交互次数小于n的用户的所有行
def get_index_inter_less_n(review, n):
    """Drop the rows of users that have fewer than ``n`` interactions.

    The number of interactions of a user is the number of rows of
    ``review`` that carry its ``user_id``.

    Args:
        review: DataFrame with a ``user_id`` column; it is not modified.
        n: Minimum number of rows a user needs in order to be kept.

    Returns:
        A DataFrame with the rows of ``review``, in their original order,
        whose user occurs at least ``n`` times.
    """
    user_ids = review['user_id']
    # Look up, for every row, how often its user occurs in the whole frame.
    interactions = user_ids.map(user_ids.value_counts())
    keep = (interactions >= n).to_numpy()
    return review[keep]
删除u,i重复的行
def remove_u_i(review_pos_sort):
    """Drop rows that repeat an already seen (user_id, business_id) pair.

    Only the first row of every pair is kept. The two columns are compared
    directly rather than through a joined string key, so pairs such as
    ('a-b', 'c') and ('a', 'b-c') stay distinct.

    Args:
        review_pos_sort: DataFrame with ``user_id`` and ``business_id``
            columns; it is not modified.

    Returns:
        A DataFrame with the de-duplicated rows in their original order.
    """
    return review_pos_sort.drop_duplicates(
        subset=['user_id', 'business_id'], keep='first'
    )
划分出test,按时间排序,将user看过的最后一个item作为测试
def get_train_test(review_pos_sort):
    """Split interactions into train and test, holding out one row per user.

    For every ``user_id`` the last of its rows (in the row order of
    ``review_pos_sort``) goes into the test set and all of its earlier rows
    go into the train set. A user with a single row therefore appears only
    in the test set. Sort the frame by time beforehand so that the held-out
    row is the user's most recent interaction.

    Args:
        review_pos_sort: DataFrame with a ``user_id`` column; it is not
            modified.

    Returns:
        A tuple ``(df_train, df_test)``, each sorted by ``user_id``. The
        sort is stable, so the rows of one user keep their input order.
    """
    # True for the final occurrence of every user_id.
    is_last = (~review_pos_sort.duplicated(subset='user_id', keep='last')).to_numpy()
    df_train = review_pos_sort[~is_last].sort_values('user_id', kind='stable')
    df_test = review_pos_sort[is_last].sort_values('user_id', kind='stable')
    return df_train, df_test