数据处理常用代码

 

 

pandas 分段读取csv

# Open train.csv lazily: iterator=True makes read_csv hand back a reader
# object instead of loading the whole file; the 'id' column is run through str.
csv_data = pd.read_csv(
    "./train.csv",
    converters={'id': str},
    iterator=True,
)

loop = True
while loop:
    try:
        # Pull the next batch of rows (1,000,000 requested per call).
        chunk = csv_data.get_chunk(1000000)
    except StopIteration:
        # The reader is exhausted: report it and leave the loop.
        print("Iteration is stopped.")
        loop = False
    else:
        # Per-chunk processing goes here.
        pass

# Append df to train.csv:
#   header=False -> do not write the column-name row
#   mode='a'     -> append to train.csv instead of overwriting it
#   index=False  -> do not write the row index
df.to_csv(
    'train.csv',
    header=False,
    mode='a',
    index=False,
)



# Read 'id' through str so that long ids are not treated as numbers in
# scientific notation.
csv_data = pd.read_csv("./train_pos.csv", converters={'id': str})

# Read a tab-separated txt file that has no header row and name its columns.
# `names` takes a single list of column labels.
csv_data = pd.read_table(
    'train_0.txt',
    header=None,
    sep='\t',
    names=['user_id', 'item_id', 'rating'],
)

# Keep only the rows whose click value is 1. The explicit copy makes df an
# independent frame, so the column assignment and in-place fillna below act
# on df itself rather than on a slice of chunk.
df = chunk[chunk.click.isin([1])].copy()

# Parse the date column (to_datetime is a pandas top-level function, not a
# DataFrame method), then sort the rows by date.
df['date'] = pd.to_datetime(df['date'])
df_sort = df.sort_values('date')

# Replace every NaN in the table with the string '-1'.
df.fillna('-1', inplace=True)

# Number of '-1' entries in column col_name. Summing the boolean mask gives 0
# when '-1' never occurs, instead of raising KeyError.
(df[col_name] == '-1').sum()

# Column names as an array.
df.columns.values

# Save to csv without the header row and without the row index.
df.to_csv('df.csv', header=False, index=False)

# How many times each value occurs in column col_name.
df[col_name].value_counts()

# Join the two tables (pd.merge joins on their shared column names by default).
pd_ = pd.merge(review_pos_sort, user)

重新编号

def renumber(csv_data, col_list):
    """Replace the values of the given columns with consecutive integer ids.

    Within each listed column, every distinct value is assigned an integer
    starting at 0, in order of first appearance, and the column is
    overwritten with those integers. Each column is numbered independently.
    The frame is modified in place.

    Args:
        csv_data: DataFrame whose columns are re-encoded.
        col_list: Iterable of column labels to re-encode.

    Returns:
        The same ``csv_data`` object, so calls can be chained.
    """
    for col in col_list:
        column = csv_data[col]
        # Series.unique() lists distinct values in order of first appearance,
        # so enumerating it yields the "next free id" numbering without a
        # Python-level loop over every row.
        codes = {value: code for code, value in enumerate(column.unique())}
        csv_data[col] = column.map(codes)
    return csv_data

将出现次数少于20的特征值替换为-1

def deal_feature(col_list, csv_data, min_count=20):
    """Replace rare values in the given columns with -1.

    Within each listed column, a value that occurs fewer than ``min_count``
    times is replaced by -1; values that occur at least ``min_count`` times
    are kept unchanged. The frame is modified in place and each column label
    is printed once that column has been processed.

    Args:
        col_list: Iterable of column labels to process.
        csv_data: DataFrame holding those columns.
        min_count: Minimum number of occurrences a value needs in order to
            be kept. Defaults to 20.

    Returns:
        The same ``csv_data`` object, so calls can be chained.
    """
    for col in col_list:
        column = csv_data[col]
        # One pass to count every distinct value. value_counts() skips
        # missing values by default, so they get no entry in the mapping.
        counts = column.value_counts()
        replacement = {
            value: (value if count >= min_count else -1)
            for value, count in counts.items()
        }
        csv_data[col] = column.map(replacement)
        print(col)
    return csv_data

输出每列的最值

# Columns of `business` whose value range is printed.
lis = ['stars', 'postal_code', 'state', 'city', 'business_id']
for column_name in lis:
    column = business[column_name]
    highest = column.max()
    lowest = column.min()
    # Output format: name[max][min]; %d renders both extremes as integers.
    print('%s[%d][%d]' % (column_name, highest, lowest))

将所有的user,item统一编号

# User-side and item-side column labels.
col_list_user = ['device_id', 'device_ip']
col_list_item = ['C1', 'banner_pos', 'site_id']
# NOTE(review): presumably the number of distinct ids per column, in column
# order -- confirm. These lists are longer than the column lists above.
user_len = [903, 34984, 3785, 5, 4, 1649]
item_len = [7, 7, 1591, 1453, 19]

# Shift every user column after the first by the running sum of the
# preceding lengths, applied identically to the train and test frames.
# zip() pairs each column with the length that precedes it and stops at the
# shorter sequence, so the extra entries in user_len cannot index past the
# end of col_list_user.
total = 0
for width, col in zip(user_len, col_list_user[1:]):
    total += width
    train_save[col] = train_save[col] + total
    test_save[col] = test_save[col] + total

将count映射为0-4的分桶编号

# Lookup table from a raw count (0..199999) to a bucket id:
#   0..9 -> 0, 10..100 -> 1, 101..1000 -> 2, 1001..10000 -> 3, above -> 4
# The bucket id equals the number of upper bounds the count exceeds.
map_u = {
    count: sum(count > bound for bound in (9, 100, 1000, 10000))
    for count in range(200000)
}

# Replace each count column of `user` with its bucket id.
# NOTE: a value that is not a key of map_u (outside 0..199999) raises KeyError.
col = ['review_count', 'useful', 'funny', 'cool', 'fans']
for name in col:
    user[name] = user[name].apply(lambda raw: map_u[raw])

删除交互小于n的行

def get_index_inter_less_n(review, n):
    """Return the rows of ``review`` whose user has at least ``n`` rows.

    Users are identified by the ``user_id`` column. Rows belonging to users
    that appear fewer than ``n`` times are dropped; the remaining rows keep
    their original order and index labels. ``review`` itself is not modified.

    Args:
        review: DataFrame with a ``user_id`` column.
        n: Minimum number of rows a user must have to be kept.

    Returns:
        A DataFrame containing only the rows of sufficiently active users.
    """
    user_ids = review['user_id']
    # Look up, for every row, how many rows its user has in total.
    rows_per_user = user_ids.map(user_ids.value_counts())
    return review[rows_per_user >= n]

删除u,i重复的行

def remove_u_i(review_pos_sort):
    """Return ``review_pos_sort`` without repeated (user, business) pairs.

    For every combination of ``user_id`` and ``business_id`` only the first
    row, in the frame's current order, is kept. The two columns are compared
    directly rather than through a joined string key, so distinct pairs
    cannot collide. The input frame is not modified and no helper column is
    added to it.

    Args:
        review_pos_sort: DataFrame with ``user_id`` and ``business_id``
            columns.

    Returns:
        A DataFrame with one row per (user_id, business_id) pair.
    """
    return review_pos_sort.drop_duplicates(
        subset=['user_id', 'business_id'],
        keep='first',
    )

划分出test,按时间排序,将user看过的最后一个item作为测试

def get_train_test(review_pos_sort):
    """Split interactions into train and test, holding out one row per user.

    For every ``user_id`` the last row, in the frame's current row order, goes
    to the test set and all of that user's earlier rows go to the train set.
    A user with a single row therefore appears only in the test set. "Last"
    means last in row order, so the frame is expected to be sorted by time
    before it is passed in.

    Args:
        review_pos_sort: DataFrame with a ``user_id`` column.

    Returns:
        A ``(df_train, df_test)`` tuple, each sorted by ``user_id``. The sort
        is stable, so within a user the train rows keep their input order.
    """
    # True on the final occurrence of each user_id, False on earlier ones.
    is_last = ~review_pos_sort['user_id'].duplicated(keep='last')

    df_train = review_pos_sort[~is_last].sort_values('user_id', kind='mergesort')
    df_test = review_pos_sort[is_last].sort_values('user_id', kind='mergesort')
    return df_train, df_test


 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

你可能感兴趣的:(数据处理常用代码)