PD操作

# -*- coding: utf-8 -*- 
# @Time     : 2018/7/6 17:10 
# @Author   : 
# @File     : 
# @Description  : 

import os

import pandas as pd
import pymongo

# Print wide DataFrames without wrapping their columns across several blocks.
# The fully qualified option name is used because pandas resolves abbreviated
# names by pattern match, which can become ambiguous (and raise) if a later
# release adds another option with a similar name.
pd.set_option('display.expand_frame_repr', False)

# MongoDB connection settings.
# NOTE(review): the URI looks like a redacted placeholder -- replace it with a
# real connection string before running.
MONGO_URI = 'mongodb://XXX:XXX'
MONGO_DB = 'LogData'  # name of the database opened below
MONGO_TABLE = 'LogData'  # name of the collection queried by get_data_from_queue

# Module-level client and database handle, created at import time and used by
# get_data_from_queue.
client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]


def get_data_from_queue(queueName):
    """Load every document of one queue into a DataFrame.

    Queries the ``MONGO_TABLE`` collection of the module-level ``db`` for
    documents whose ``queueName`` field equals *queueName*.

    Args:
        queueName: Value matched against the ``queueName`` field.

    Returns:
        A ``pandas.DataFrame`` built from the matching documents.
    """
    cursor = db[MONGO_TABLE].find({'queueName': queueName})
    # Materialize the cursor first so the DataFrame is built from a plain list.
    documents = list(cursor)
    return pd.DataFrame(documents)


def select_data_from_queue(df, queueName, output_filename):
    """Select rows with a usable contact identifier and export them as TSV.

    A row is kept when either:
      * ``Contact_Alias`` is not null, or
      * ``Contact_Alias`` is null and ``Contact_QuanPin`` is not null and does
        not start with ``'wxid'``; in that case ``Contact_QuanPin`` is written
        in place of the missing alias.

    Rows with an alias come first in the output, followed by the rows
    recovered through ``Contact_QuanPin``. The result is written without a
    header to ``./data/<output_filename>_wechat_<row count>.txt`` as
    tab-separated ``phone`` / ``Contact_Alias`` pairs. Progress counts are
    printed to stdout.

    Args:
        df: DataFrame containing the columns ``phone``, ``Contact_Alias`` and
            ``Contact_QuanPin``.
        queueName: Queue name, used only in the progress message.
        output_filename: Prefix of the output file name.

    Raises:
        KeyError: If one of the required columns is missing from *df*.
    """
    print('正在处理queue:', queueName)

    alias_present = df['Contact_Alias'].notnull()
    # Count with sum() instead of value_counts().loc[True/False]:
    # value_counts() omits keys whose count is zero, so the .loc lookup raised
    # KeyError whenever every alias was present or every alias was missing.
    print('Contact_Alias不为空的数据为:', int(alias_present.sum()))
    print('Contact_Alias为空的数据为:', int((~alias_present).sum()))

    # Case 1: Contact_Alias is present.
    df_alias_not_null = df[alias_present]

    # Case 2: Contact_Alias is missing and Contact_QuanPin is present and does
    # not start with 'wxid'. astype(str) keeps the prefix test from failing on
    # non-string values; missing values are excluded by the notnull() term.
    quanpin = df['Contact_QuanPin']
    quanpin_usable = quanpin.notnull() & ~quanpin.astype(str).str.startswith('wxid')
    df_auxiliary = df[~alias_present & quanpin_usable]
    print('通过Contact_QuanPin新增数据:', len(df_auxiliary))

    df_output = pd.concat([df_alias_not_null, df_auxiliary], axis=0)
    df_output = df_output[['phone', 'Contact_Alias', 'Contact_QuanPin']].reset_index(drop=True)

    num = len(df_output)
    print('最终输出数据条数;', num)
    print()

    # Use Contact_QuanPin as the alias for the rows recovered in case 2.
    df_output['Contact_Alias'] = df_output['Contact_Alias'].fillna(df_output['Contact_QuanPin'])

    # to_csv does not create missing parent directories.
    os.makedirs('./data', exist_ok=True)
    df_output.to_csv('./data/' + output_filename + '_wechat_' + str(num) + '.txt',
                     columns=['phone', 'Contact_Alias'],
                     header=False, index=False, sep='\t')


def run(queueName, output_filename):
    """Fetch one queue's records and export the selected contacts.

    Args:
        queueName: Queue whose records are loaded via get_data_from_queue.
        output_filename: File name prefix passed to select_data_from_queue.
    """
    select_data_from_queue(get_data_from_queue(queueName), queueName, output_filename)


# Script entry point: process the 'qbrq_1000' queue.
# NOTE(review): 'XXXX' looks like a placeholder output prefix -- confirm before use.
if __name__ == '__main__':
    run('qbrq_1000', output_filename='XXXX')

你可能感兴趣的:(PD操作)