发现一个不错的网站 https://ourworldindata.org/covid-vaccinations ,可以获取世界各国接种疫苗的最新信息。页面最后面有一个表格,记录了哪个国家接种哪些疫苗,感觉挺不错的,于是发挥了一下 Python 功底,获取这个表格的内容并写入 MySQL,代码如下:
from html.parser import HTMLParser
import requests
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import VARCHAR
class MyHTMLParser(HTMLParser):
    """Collect the header cells and body rows of an HTML table.

    After ``feed()``:

    * ``header`` holds the stripped text of every ``<th>`` text chunk, in
      document order.
    * ``data`` holds one list per ``<tr>`` that produced at least one
      ``<td>`` text chunk; each list contains the stripped chunks of that row.

    Text is recorded only while the most recently opened tag is the cell
    itself.  Any other start tag (including one nested inside a cell, such
    as ``<a>`` or ``<span>``) switches collection off until the next
    ``<th>``/``<td>`` opens, so text inside or after nested markup is
    skipped.  A cell whose text is only whitespace is recorded as ``''``;
    a cell with no text at all records nothing.
    """

    def __init__(self):
        super().__init__()
        self.in_td = False  # True while directly inside a <td>
        self.in_th = False  # True while directly inside a <th>
        self.data = []      # all completed rows
        self.line = []      # cells of the row currently being read
        self.header = []    # header cell texts

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Track whether the tag that just opened is a header or data cell."""
        # Every start tag re-evaluates both flags, so a nested tag inside a
        # cell turns collection off (see the class docstring).
        self.in_th = tag == 'th'
        self.in_td = tag == 'td'
        if tag == 'tr':
            self.line = []  # start collecting a new row

    def handle_endtag(self, tag: str) -> None:
        """Leave cell mode on ``</td>``/``</th>`` and store the row on ``</tr>``."""
        if tag == 'td':
            self.in_td = False
        if tag == 'th':
            self.in_th = False
        if tag == 'tr' and self.line:
            self.data.append(self.line)
            # Start a fresh buffer so a stray or repeated </tr> cannot append
            # the very same list object to ``data`` a second time.
            self.line = []

    def handle_data(self, data: str) -> None:
        """Record stripped text that appears directly inside a cell."""
        text = data.strip()
        if self.in_th:
            self.header.append(text)
        if self.in_td:
            self.line.append(text)
# Page that carries the "which country uses which vaccine" table.
_PAGE_URL = 'https://ourworldindata.org/covid-vaccinations#attitudes-to-covid-19-vaccinations'

# Literal substitutions applied, in order, to the raw table markup before it
# is parsed: a dangling "(see" fragment is removed and vaccine brand names are
# replaced with their Chinese names.
_TABLE_REPLACEMENTS = (
    ('(see', ''),
    ('Oxford/AstraZeneca', '阿斯利康'),
    ('Sputnik V', '卫星五'),
    ('Sinovac', '科兴'),
    ('Covaxin', '巴拉特'),
    ('Pfizer/BioNTech', '辉瑞'),
    ('Moderna', '莫德纳'),
    ("Johnson&Johnson", '强生'),
    # NOTE(review): raw HTML normally spells "&" as "&amp;"; the published
    # literal may have been entity-decoded, so both spellings are handled.
    ("Johnson&amp;Johnson", '强生'),
    ('Sinopharm/Beijing', '国药北京'),
    ('Sinopharm/Wuhan', '国药武汉'),
    ('CanSino', '康希诺'),
)


def _extract_first_table(page_html):
    """Return the markup between the first ``<table ...>`` tag and its ``</table>``.

    NOTE(review): the published script split the page on two separator
    literals that were lost (they appear as empty / broken strings, which
    cannot run).  They are assumed to have been the opening and closing
    table tags -- confirm against the live page.

    Raises:
        ValueError: if the page contains no ``<table>`` element.
    """
    import re  # local import keeps this block independent of the file header

    match = re.search(r'<table\b[^>]*>(.*?)</table>', page_html,
                      re.IGNORECASE | re.DOTALL)
    if match is None:
        raise ValueError('no <table> element found in the page')
    return match.group(1)


def _localize_vaccine_names(table_html):
    """Apply ``_TABLE_REPLACEMENTS`` to the table markup and return the result."""
    for source, replacement in _TABLE_REPLACEMENTS:
        table_html = table_html.replace(source, replacement)
    # NOTE(review): the published script also removed two more markup tokens
    # here, but both literals were lost (they read as empty strings, which
    # makes the call a no-op).  Restore them if cells come out truncated.
    return table_html


def main():
    """Scrape the vaccine table and update the ``vaccine`` table in the database.

    Only rows whose ``Location`` is flagged ``fixable='Y'`` in the database
    are updated; other rows hold manual corrections and are left alone.

    The SQLAlchemy database URL is read from the ``VACCINE_DB_URL``
    environment variable.  NOTE(review): the URL in the published script
    contained a redaction placeholder instead of the real credentials, so it
    could not be kept as a default.
    """
    import os

    from sqlalchemy import text

    db_url = os.environ.get('VACCINE_DB_URL')
    if not db_url:
        raise SystemExit(
            'Set VACCINE_DB_URL to the SQLAlchemy URL of the target database, '
            'e.g. mysql+pymysql://user:password@host:3306/economy'
        )

    # A timeout prevents the script from hanging forever on a stalled
    # connection; raise_for_status avoids parsing an error page as data.
    response = requests.get(_PAGE_URL, timeout=30)
    response.raise_for_status()
    response.encoding = 'utf-8'
    table_html = _localize_vaccine_names(_extract_first_table(response.text))

    parser = MyHTMLParser()
    parser.feed(table_html)
    header = parser.header
    # The rows carry no value for the "Source" column -- presumably because
    # those cells hold only a link and MyHTMLParser skips text inside nested
    # tags -- so its header is dropped to keep columns and rows aligned.
    header.remove('Source')
    df = pd.DataFrame(parser.data, columns=header)

    engine = create_engine(db_url)

    # Translate country names to Chinese where a mapping exists.
    countries = pd.read_sql(
        "select country,country_cn from countrylist where iscountry='Y'", engine)
    country_cn = dict(zip(countries['country'], countries['country_cn']))
    df['Location'] = df['Location'].apply(
        lambda name: country_cn.get(name, name)).astype(str)
    df = df.drop(columns='Last observation date')

    # Locations that may be overwritten by scraped data.
    fixable = set(
        pd.read_sql("select Location from vaccine where fixable='Y'", engine)['Location'])

    # Bound parameters instead of string formatting: scraped text containing
    # quotes can neither break nor inject into the statement.
    update_stmt = text(
        'update vaccine set Vaccines=:vaccines where Location=:location')
    with engine.begin() as connection:  # one transaction, committed on success
        for location, vaccines in zip(df['Location'], df['Vaccines']):
            if location not in fixable:
                continue
            connection.execute(
                update_stmt, {'vaccines': vaccines, 'location': location})
            print(f'update vaccine set Vaccines={vaccines!r} where Location={location!r}')

    # One-off initial load of the table, kept for reference:
    # dtypedict = {'Location':VARCHAR(120),'Vaccines':VARCHAR(200)}
    # df.to_sql(name='vaccine',con=engine,if_exists='append',index=False,dtype=dtypedict)


if __name__ == '__main__':
    main()