问题描述:业务系统的会员信息表进入数仓后存在数据缺失。造成这一问题的原因是后台会手动处理一些数据(卸数等ETL流程本身不存在问题),导致卸数时取不到这些数据。经过考虑,决定用Python解决这一问题。
以下代码中涉及到的一些比较有用的函数及问题:
1、比较两个list,取出一个list不存在于另一个list中的值
final=set(data_userid).difference(set(data_memberid))
final_list=list(final)
2、将数据框dataframe写入数据库,用以下方法很容易实现
engine=create_engine('oracle://pdm:dwPDM2018#@192.168.0.72:1521/pdm', echo=True)
data.convert_objects(convert_numeric=True).to_sql('t01_e3_member_info_lxh', con=engine, if_exists='replace', index=False, index_label=None)
3、当利用Python执行insert into tablename1 select * from tablename2语句时,若tablename2中存在空数据,并且使用import pandas.io.sql as sql;sql.read_sql(sqlstr1,conn_oracle)来执行,会报错TypeError: 'NoneType' object is not iterable(read_sql需要语句返回结果集,而insert语句不返回结果集)。解决办法:改用游标curs.execute(sqlstr1)来执行sql,即可轻松解决此问题。
4、在使用pymysql读取MySQL数据库中的表时,如果sql中含有from_unixtime(shipping_time_ck, '%Y-%m-%d')形式的语句,直接读取会报以下错误:
query = query % self._escape_args(args, conn)
ValueError: unsupported format character 'Y' (0x59) at index 102
解决办法:将from_unixtime(shipping_time_ck, '%Y-%m-%d')改为from_unixtime(shipping_time_ck, '%%Y-%%m-%%d'),改为两个%,问题解决。
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 14 14:52:28 2019
@author: liuxiaohuan
"""
import os
import cx_Oracle
import MySQLdb
import pandas.io.sql as sql
import numpy as np
import pymysql
from sqlalchemy import create_engine
# Required when the database tables contain Chinese text; set before the
# Oracle connection below is opened.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'

# Read the E3 member table in the warehouse (pdm). data_memberid ends up
# holding every member ID loaded for yesterday's trade date, as strings.
conn_oracle = cx_Oracle.connect('pdm/[email protected]/pdm')
sqlstr1 = '''select distinct memberid from t01_e3_member_info where trd_dt=trunc(sysdate-1)'''
memberid = sql.read_sql(sqlstr1, conn_oracle)

# The query returns a single column, so each converted row is a one-element
# list; take that element and stringify it so it can be compared with the
# string IDs read from MySQL.
data_memberid = [str(record[0]) for record in np.array(memberid).tolist()]
##############################################################################################################
# Read the E3 users table from MySQL. data_userid ends up holding the IDs of
# the users registered yesterday, as strings.
sqlstr2 = '''select distinct user_id from users where from_unixtime (reg_time, '%Y-%m-%d' )=date_format(DATE_SUB(curdate(),INTERVAL 1 DAY) , '%Y-%m-%d')'''
conn_mysql = MySQLdb.connect("154.167.20.789", "asjgf", "adush", "we", charset='utf8')
try:
    curs = conn_mysql.cursor()
    try:
        # No parameters are passed, so the single '%' signs in the SQL are
        # sent to the server unchanged.
        curs.execute(sqlstr2)
        row = curs.fetchall()
    finally:
        curs.close()
finally:
    # The rows are already fetched; release the connection instead of
    # leaving it open for the rest of the script.
    conn_mysql.close()
print(type(row))

# Each fetched row is a one-column tuple; keep the ID as a string.
data_userid = [str(record[0]) for record in row]
######################################################################################################################
# Compare E3 with the warehouse: the IDs present in E3 (data_userid) but
# absent from the warehouse (data_memberid) are the missing member IDs.
final = set(data_userid) - set(data_memberid)
final_list = [*final]  # member IDs missing from the warehouse
####################################################################################################################
# Fetch every column of the users whose IDs are missing from the warehouse.
pymysql.install_as_MySQLdb()
HOST = '154.167.20.789'
DB = 'we'
PORT = 3306
USER = 'asjgf'
PASSWORD = 'adush'
engine = create_engine("mysql://{}:{}@{}:{}/{}?charset=utf8".format(USER, PASSWORD, HOST, PORT, DB))
# The '%' signs are doubled here so that the date format specifiers survive
# the driver's %-style query formatting.
sqlstr3 = '''select * from users where from_unixtime (reg_time, '%%Y-%%m-%%d' )=date_format(DATE_SUB(curdate(),INTERVAL 1 DAY) , '%%Y-%%m-%%d')'''
result = sql.read_sql(sqlstr3, engine)
# final_list holds the IDs as strings, so compare against the string form of
# user_id; a numeric user_id column would otherwise never match any entry.
data = result[result['user_id'].astype(str).isin(final_list)]  # rows missing from the warehouse
#data.to_csv('E:\\aa.txt', sep='\t', index=False,header=False)
#####################################################################################################################
# Write the missing rows into a staging table, replacing whatever it held.
engine = create_engine('oracle://dsjfhjs#@152.456.0.95:1521/pdm', echo=True)
# NOTE(review): convert_objects is deprecated in newer pandas releases --
# confirm the installed version still provides it.
converted = data.convert_objects(convert_numeric=True)
converted.to_sql(
    't01_e3_member_info_lxh',
    con=engine,
    if_exists='replace',
    index=False,
    index_label=None,
)
# Insert the missing rows from the staging table into the warehouse member
# table and its history table.
curs = conn_oracle.cursor()
# last_ip now uses substr(last_ip,1,255) like every other text column; the
# previous substr(last_ip,255) started reading at character 255 instead of
# keeping the first 255 characters.
sqlstr_a = '''
insert into t01_e3_member_info
select user_id ,
lylx ,
sd_id ,
ncm_fxs_id ,
user_rank ,
to_char(substr(user_name,1,255)) ,
to_char(substr(nick_name,1,255)) ,
to_char(substr(email,1,255)) ,
to_char(substr(buyer_alipay_no,1,600)) ,
sex ,
to_char(substr(password,1,255)) ,
to_char(substr(question,1,4000)) ,
to_char(substr(answer,1,4000)) ,
to_char(substr(birthday,1,255)) ,
user_money ,
frozen_money ,
rank_points ,
paid_money ,
paid_count ,
last_paid ,
reg_time ,
credit_rank ,
last_login ,
last_time ,
to_char(substr(last_ip,1,255)) ,
visit_count ,
is_special ,
is_potential ,
is_koubei ,
is_warn ,
parent_id ,
to_char(substr(alias,1,255)) ,
to_char(substr(msn,1,255)) ,
to_char(substr(wangwang,1,255)) ,
to_char(substr(qq,1,100)) ,
to_char(substr(office_phone,1,100)) ,
home_phone ,
mobile_phone ,
status ,
is_unpopular ,
is_auto_rank ,
comment_nums ,
good_comment_nums ,
is_black ,
to_char(substr(bz,1,4000)) ,
mjyx ,
mjly ,
to_char(substr(gmph,1,255)) ,
is_by ,
is_zhgy ,
latest_hyyxd_time ,
is_icrm ,
os_user_id ,
sync_bstyle ,
trunc(sysdate-1) as trd_dt,
trunc(sysdate) as load_dt
from T01_E3_MEMBER_INFO_LXH
'''
# Insert into the history table.
sqlstr_h = '''
insert into t01_e3_member_info_h
select user_id ,
mobile_phone ,
trunc(sysdate-1) as trd_dt,
trunc(sysdate) as load_dt
from T01_E3_MEMBER_INFO_LXH
'''
try:
    # Both inserts are committed together so the member table and the
    # history table are never left half-updated.
    curs.execute(sqlstr_a)
    curs.execute(sqlstr_h)
    conn_oracle.commit()
except Exception:
    # Undo any partial insert before surfacing the error.
    conn_oracle.rollback()
    raise
finally:
    curs.close()