A few rambling notes up front:
Scraping COVID-19 data has been done to death; there are plenty of blog posts and GitHub repos out there.
There are two main kinds of sources: aggregated data from platforms like NetEase, Tencent, Alipay, and DXY (丁香园), and the official releases from the provincial/municipal health commissions and the National Health Commission.
The first kind is, of course, derived from the second.
Taobao also sells COVID data, but when I asked, it was city-level only, with nothing broken down to the district level.
Anyway, it depends on what you need.
Here are two GitHub projects.
https://github.com/lewangdev/shanghai-lockdown-covid-19
This is a crawler the author wrote during the 2022 Shanghai lockdown to scrape the health commission releases. The code below is mostly adapted from this project.
https://github.com/BlankerL/DXY-COVID-19-Crawler
This is an earlier project with lots of stars and forks. Its data runs from 2020 to the present and is still being updated, covering the provinces/cities plus other countries. Note that these are cumulative figures: cumulative confirmed cases, cumulative asymptomatic cases, and so on.
I computed daily new confirmed cases from its data and compared them against what I scraped from the Shanghai Health Commission; they agreed about 75% of the time.
The author also provides an API and such, but I didn't look into it closely.
All things considered, I chose to adapt the Shanghai project's code myself and scrape the health commission data for the city I needed.
Also, I'm using a Jupyter notebook; each of the four parts below goes in its own cell.
This first part crawls each news item on the health commission site and saves it as an HTML file under a specified directory.
For the JSON bookkeeping file, just create an empty one yourself, as sketched below.
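A minimal sketch of that setup, assuming the same archived_html_dir as in the cell that follows (the path is a placeholder):

import os, json

archived_html_dir = "parent directory that holds urls.json"  # placeholder path
os.makedirs(archived_html_dir, exist_ok=True)

urls_json = os.path.join(archived_html_dir, "urls.json")
if not os.path.exists(urls_json):
    with open(urls_json, "w", encoding="utf-8-sig") as f:
        json.dump([], f)  # the crawler expects a JSON list of already-crawled URLs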
import os
import re
import json
import hashlib
import requests
from bs4 import BeautifulSoup

archived_html_dir = "parent directory that holds urls.json"
urls_crawled_filename = f"{archived_html_dir}/urls.json"
def clean_filename(filename):
    # Remove special characters except for underscores and hyphens
    cleaned_filename = re.sub(r'[^\w\-]', '_', filename)
    return cleaned_filename

def write_file(content, filename):
    with open(filename, 'w', encoding='utf-8-sig') as f:
        f.write(content)

def read_file(filename):
    with open(filename, 'r', encoding='utf-8-sig') as f:
        return f.read()
# Read urls.json and parse its content as JSON
def get_urls_crawled():
    if os.path.exists(urls_crawled_filename):
        content = read_file(urls_crawled_filename)
        try:
            return json.loads(content)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)  # empty or invalid JSON: fall back to an empty list
            return []
    return []
# Fetch a page
def get_html_content(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    return r.text
# Fetch a single link, hash its content, and save it
def crawl_url(target_url, text):
    urls = []
    hyperlink_html_content = get_html_content(target_url)
    hashname = hashlib.md5(
        hyperlink_html_content.encode('utf8')).hexdigest()  # hash of the page content
    filename = f"{hashname}.html"
    urls.append(
        {"url": target_url, "text": text, "filename": filename})  # record url, title and filename
    write_file(hyperlink_html_content,
               f"{archived_html_dir}/{filename}")  # save the fetched page as an HTML file
    return urls
def crawl(pages, urls_crawled):
    urls = []
    for p in pages:
        if p == '':
            url = f"http://wjw.sz.gov.cn/yqxx/"
        else:
            url = f"http://wjw.sz.gov.cn/yqxx/index_{p}.html"
        html_content = get_html_content(url)
        soup = BeautifulSoup(html_content, 'html.parser')
        # Find all li elements that hold a title and a link
        list_items = soup.find_all('li')
        # Walk the li elements and pull out the title and date
        for li in list_items:
            title_element = None
            date_element = None
            href = None
            title_element = li.find('a')
            date_element = li.find('span')
            if title_element and date_element:
                title = title_element.get_text()
                href = title_element['href']
                date = date_element.get_text()
                print("Title:", title)
                print("Link:", href)
                print("Date:", date)
            # Fetch the linked page and hash its content
            if href is not None:
                hyperlink_html_content = get_html_content(href)
                hashname = hashlib.md5(
                    hyperlink_html_content.encode('utf8')).hexdigest()
                # Some titles end in characters such as ? that break filenames, so clean them up
                cleaned_title = clean_filename(title)
                filename = f"{cleaned_title}.html"
                # Check whether this day's article has already been saved as HTML
                file_path = os.path.join(archived_html_dir, filename)
                if not os.path.exists(file_path):
                    print(f'Saving {file_path}')
                    write_file(hyperlink_html_content, f"{archived_html_dir}/{filename}")
                else:
                    print(f'{file_path} already exists')
                # Check whether this day's article is already recorded in urls.json
                if href in set(map(lambda x: x['url'], urls_crawled)):
                    print(f'{href} is already in urls.json')
                    print('************************************')
                    continue
                else:
                    print(f'Adding {href} to urls.json')
                    urls.append(
                        {"url": href, "text": title, "filename": filename})
                    print('****************************************')
            else:
                print(f'{href} is none')
    return urls
# Save the crawl records
def save_urls_crawled(urls):
    json_data = json.dumps(urls, ensure_ascii=False, indent=4, separators=(',', ':'))
    write_file(json_data, urls_crawled_filename)  # write everything back to urls.json
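For reference, each entry that save_urls_crawled writes to urls.json carries the article URL, its title, and the local HTML filename (the keys come from the crawl function above). A quick way to peek at the file, as a rough sketch:

import json

with open(urls_crawled_filename, 'r', encoding='utf-8-sig') as f:
    records = json.load(f)

print(len(records), "articles recorded")
print(records[0])  # e.g. {"url": "...", "text": "<news title>", "filename": "<cleaned title>.html"}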
This step reads every news item whose title matches a regular expression, e.g. '2022年1月20日新冠肺炎疫情情况' (the daily situation report).
The health commission publishes far more than the daily case reports; Shenzhen in particular posts tons of other news.
Once a report is read, the text about the outbreak is pulled out of the HTML and filtered line by line with regular expressions to find the confirmed and asymptomatic counts.
The original GitHub project is more precise and also computes things like asymptomatic-to-confirmed conversions; I dropped those here.
import os
import re
import json
from bs4 import BeautifulSoup
import pandas as pd
# Extract the month, day, confirmed and asymptomatic counts from one line of a daily report
def extract_line(line):
    # e.g. 12月27日0-24时,深圳新增确诊病例xx和无症状感染者xx。
    regex_1 = r"^(\d+)月(\d+)日.*?深圳新增本土确诊病例(\d+)例.*?无症状感染者(\d+)例.*?"
    pattern_1 = re.compile(regex_1, re.IGNORECASE)
    m1 = pattern_1.match(line)
    if m1 is not None:
        (m, d, confirmed, asymptomatic) = m1.groups()
        return (int(m), int(d), int(confirmed), int(asymptomatic))

    # e.g. 7月7日0-24时,深圳本土新增新冠肺炎确诊病例2例和新冠病毒无症状感染者4例。
    regex_15 = r"^(\d+)月(\d+)日.*?深圳本土新增.*?确诊病例(\d+)例.*?无症状感染者(\d+)例.*?"
    pattern_15 = re.compile(regex_15, re.IGNORECASE)
    m15 = pattern_15.match(line)
    if m15 is not None:
        (m, d, confirmed, asymptomatic) = m15.groups()
        return (int(m), int(d), int(confirmed), int(asymptomatic))

    # e.g. 深圳新增x例本土确诊病例和x例本土无症状
    regex_14 = r"^(\d+)月(\d+)日.*?深圳新增本土(\d+)例确诊.*?和(\d+)例.*?无症状.*?"
    pattern_14 = re.compile(regex_14, re.IGNORECASE)
    m14 = pattern_14.match(line)
    if m14 is not None:
        (m, d, confirmed, asymptomatic) = m14.groups()
        return (int(m), int(d), int(confirmed), int(asymptomatic))

    # e.g. 深圳新增x例本土确诊病例和x例本土无症状
    regex_11 = r"^(\d+)月(\d+)日.*?深圳新增(\d+)例本土确诊.*?和(\d+)例本土无症状.*?"
    pattern_11 = re.compile(regex_11, re.IGNORECASE)
    m11 = pattern_11.match(line)
    if m11 is not None:
        (m, d, confirmed, asymptomatic) = m11.groups()
        return (int(m), int(d), int(confirmed), int(asymptomatic))

    # e.g. 12月27日0-24时,深圳无本土确诊病例和本土无症状感染者。
    regex_5 = r"^(\d+)月(\d+)日.*?深圳无本土确诊病例和本土无症状感染者.*?"
    pattern_5 = re.compile(regex_5, re.IGNORECASE)
    m5 = pattern_5.match(line)
    if m5 is not None:
        (m, d) = m5.groups()
        return (int(m), int(d), 0, 0)

    # e.g. 12月27日0-24时,深圳无新增确诊病例和无症状感染者。
    regex_10 = r"^(\d+)月(\d+)日.*?深圳无新增确诊病例和无症状感染者.*?"
    pattern_10 = re.compile(regex_10, re.IGNORECASE)
    m10 = pattern_10.match(line)
    if m10 is not None:
        (m, d) = m10.groups()
        return (int(m), int(d), 0, 0)

    # e.g. 深圳新增新冠肺炎确诊病例xx例
    regex_2 = r"^(\d+)月(\d+)日.*?深圳新增新冠肺炎确诊病例(\d+)例.*?"
    pattern_2 = re.compile(regex_2, re.IGNORECASE)
    m2 = pattern_2.match(line)
    if m2 is not None:
        (m, d, confirmed) = m2.groups()
        return (int(m), int(d), int(confirmed), 0)

    # e.g. 深圳新增x例新冠肺炎确诊病例
    regex_6 = r"^(\d+)月(\d+)日.*?深圳新增(\d+)例新冠肺炎确诊病例.*?"
    pattern_6 = re.compile(regex_6, re.IGNORECASE)
    m6 = pattern_6.match(line)
    if m6 is not None:
        (m, d, confirmed) = m6.groups()
        return (int(m), int(d), int(confirmed), 0)

    # e.g. 深圳新增xx例新冠病毒无症状感染者
    regex_4 = r"^(\d+)月(\d+)日.*?深圳新增(\d+)例新冠病毒无症状.*?"
    pattern_4 = re.compile(regex_4, re.IGNORECASE)
    m4 = pattern_4.match(line)
    if m4 is not None:
        (m, d, asymptomatic) = m4.groups()
        return (int(m), int(d), 0, int(asymptomatic))

    # e.g. 深圳本土新增无症状感染者x例
    regex_16 = r"^(\d+)月(\d+)日.*?深圳本土新增.*?无症状感染者(\d+)例.*?"
    pattern_16 = re.compile(regex_16, re.IGNORECASE)
    m16 = pattern_16.match(line)
    if m16 is not None:
        (m, d, asymptomatic) = m16.groups()
        return (int(m), int(d), 0, int(asymptomatic))

    # e.g. 深圳xxx发现xx例新冠病毒无症状感染者
    regex_7 = r"^(\d+)月(\d+)日.*?深圳.*?发现(\d+)例新冠病毒无症状.*?"
    pattern_7 = re.compile(regex_7, re.IGNORECASE)
    m7 = pattern_7.match(line)
    if m7 is not None:
        (m, d, asymptomatic) = m7.groups()
        return (int(m), int(d), 0, int(asymptomatic))

    # e.g. 深圳新增xx例新冠肺炎确诊和xx例新冠病毒无症状感染者
    regex_3 = r"^(\d+)月(\d+)日.*?深圳新增(\d+)例新冠肺炎确诊和(\d+)例.*?新冠病毒无症状.*?"
    pattern_3 = re.compile(regex_3, re.IGNORECASE)
    m3 = pattern_3.match(line)
    if m3 is not None:
        (m, d, confirmed, asymptomatic) = m3.groups()
        return (int(m), int(d), int(confirmed), int(asymptomatic))

    # e.g. xx例诊断为新冠肺炎确诊病例,xx例诊断为新冠病毒无症状感染者
    regex_8 = r"^(\d+)月(\d+)日.*?深圳新增.*?(\d+)例诊断为新冠肺炎确诊.*?(\d+)例诊断为新冠病毒无症状.*?"
    pattern_8 = re.compile(regex_8, re.IGNORECASE)
    m8 = pattern_8.match(line)
    if m8 is not None:
        (m, d, confirmed, asymptomatic) = m8.groups()
        return (int(m), int(d), int(confirmed), int(asymptomatic))

    # e.g. 深圳新增x例本土确诊病例
    regex_9 = r"^(\d+)月(\d+)日.*?深圳新增(\d+)例本土.*?确诊.*?"
    pattern_9 = re.compile(regex_9, re.IGNORECASE)
    m9 = pattern_9.match(line)
    if m9 is not None:
        (m, d, confirmed) = m9.groups()
        return (int(m), int(d), int(confirmed), 0)

    # e.g. 深圳本土新增确诊病例x例
    regex_17 = r"^(\d+)月(\d+)日.*?深圳本土新增.*?确诊病例(\d+)例.*?"
    pattern_17 = re.compile(regex_17, re.IGNORECASE)
    m17 = pattern_17.match(line)
    if m17 is not None:
        (m, d, confirmed) = m17.groups()
        return (int(m), int(d), int(confirmed), 0)

    # e.g. 深圳无新增病例
    regex_12 = r"^(\d+)月(\d+)日.*?深圳无新增病例.*?"
    pattern_12 = re.compile(regex_12, re.IGNORECASE)
    m12 = pattern_12.match(line)
    if m12 is not None:
        (m, d) = m12.groups()
        return (int(m), int(d), 0, 0)

    # e.g. 深圳无本土新增病例
    regex_13 = r"^(\d+)月(\d+)日.*?深圳.*?无本土新增病例.*?"
    pattern_13 = re.compile(regex_13, re.IGNORECASE)
    m13 = pattern_13.match(line)
    if m13 is not None:
        (m, d) = m13.groups()
        return (int(m), int(d), 0, 0)

    # None of the patterns matched this line
    return (0, 0, 0, 0)
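
# A quick sanity check for extract_line: the sample sentence below is made up in the same
# style as the real bulletins, so treat it as a hypothetical example.
#   extract_line("3月1日0-24时,深圳新增本土确诊病例4例和新增本土无症状感染者2例。")
#   -> (3, 1, 4, 2)
# Any line that matches none of the patterns falls through to (0, 0, 0, 0).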
# Pull the text out of an HTML file so the data lines can be read
def parse_html_to_lines(filename: str):
    with open(filename, 'r', encoding='utf-8-sig') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')
    # Select the p tags inside the article body via a CSS selector
    span_elems = soup.select(".news_cont_d_wrap p")  # this is the class the Shenzhen site uses
    # Collect all the text in the HTML
    lines = []
    for span_elem in span_elems:
        text = span_elem.get_text().strip()
        if text:  # skip empty lines
            lines.append(text)
    return lines
def parse_lines_to_json(lines):
    total_found = False
    total = None
    for line in lines:
        print(line)
        # Pull the month, day, confirmed and asymptomatic counts from the first line that matches
        if not total_found:
            (m, d, confirmed, asymptomatic) = extract_line(line)
            if (m, d) == (0, 0):
                continue  # none of the patterns matched this line, try the next one
            total_found = True
            total = dict(
                date=f"{m:0>2}-{d:0>2}",
                confirmed=int(confirmed),
                asymptomatic=int(asymptomatic),)
            print(f'Data extracted from the report: {total}')
    if not total_found:
        print('No matching text found in this article')
    return total  # handed back to the caller
# Read the text out of the HTML file, then pull the numbers from the text
def parse_html_to_json(filename: str):
    lines = parse_html_to_lines(filename)  # extract the text content of the HTML
    return parse_lines_to_json(lines)
def generate_json_files(urls, archived_html_dir):
    # Daily report titles look like: 2022年1月20日深圳市新冠肺炎疫情情况
    regex = r"(\d+)年(\d+)月(\d+)日深圳市新冠肺炎疫情情况"
    pattern = re.compile(regex, re.IGNORECASE)
    total = None
    for url in urls:
        text = url['text']
        print(f'News title: {text}')
        m = pattern.match(text)
        if m is None:
            print('The title did not match the pattern')
            print('**********************')
            continue
        else:
            filename = archived_html_dir + '/' + url['filename']
            print(f"Parse: {text}, filename: {filename}")
            if os.path.exists(filename):
                print(f'Regex match: {m}')
                total = parse_html_to_json(filename)
                if total is None:
                    print('No case numbers could be extracted from this article')
                    print('**********************')
                    continue
                # Save the citywide totals as a CSV
                date = total['date']
                df_total = pd.DataFrame([total])
                df_total.to_csv(
                    "your/output/dir/{}_total.csv".format(date),
                    encoding='utf-8-sig', index=False)
                print(df_total)
                print('**********************')
            else:
                print(str(url['filename']) + ' does not exist in the archive folder')
                print('**********************')
    return total
# Titles about new imported cases appear both when there are no local cases and when there are, so they cannot be told apart.
# Therefore, for articles like '6月4日深圳无本土新增病例.html', the confirmed count for that date is set to 0 directly, without the line-by-line regexes.
def generate_none_confirmed(urls, archived_html_dir):
    regex = r"(\d+)月(\d+)日深圳无本土新增病例"
    pattern = re.compile(regex, re.IGNORECASE)
    total_none_confirmed = None
    for url in urls:
        text = url['text']
        m = pattern.match(text)
        if m is None:
            continue
        else:
            (month, day) = m.groups()
            total_none_confirmed = dict(
                date=f"{month:0>2}-{day:0>2}",
                confirmed=0,
                asymptomatic=None)  # use None instead of nan
            # Save as CSV
            date = total_none_confirmed['date']
            df_total_none_confirmed = pd.DataFrame([total_none_confirmed])
            df_total_none_confirmed.to_csv(
                "your/output/dir/{}_total.csv".format(date),
                encoding='utf-8-sig', index=False)
            print(df_total_none_confirmed)
    return total_none_confirmed
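When one day's numbers come out wrong, it helps to run the parser on a single archived file and watch what it prints. A minimal sketch, assuming a filename recorded in urls.json (the one below is hypothetical):

archived_html_dir = 'directory where the HTML files are saved'
sample_file = f"{archived_html_dir}/2022年1月20日深圳市新冠肺炎疫情情况.html"  # hypothetical filename
print(parse_html_to_json(sample_file))  # e.g. {'date': '01-20', 'confirmed': ..., 'asymptomatic': ...}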
Note the order here: run part 1 first, then comment it out and run 2.1, and so on.
Part 1 calls the crawler defined above; part 2 parses the downloaded HTML to get the case data.
import pandas as pd
import os
from datetime import datetime, timedelta

# 1 Crawl the health commission news pages
# pages = ['']  # this reads only the first page of the listing (http://wjw.sz.gov.cn/yqxx/)
# pages = list(range(3, 77))  # covers the whole of 2022
# urls_crawled = get_urls_crawled()  # read the existing urls.json
# urls = crawl(pages, urls_crawled)  # read each day's link from the site, skip ones already in urls.json, save the pages
# urls.extend(urls_crawled)  # urls only holds the newly crawled pages, so merge in the existing records
# save_urls_crawled(urls)

# 2 Parse the HTML
# 2.1 Read the urls.json produced by the crawler
archived_html_dir = 'directory where the HTML files are saved'
urls_crawled_filename = f"{archived_html_dir}/urls.json"
urls_crawled = get_urls_crawled()  # read the existing urls.json
len_urls = len(urls_crawled)
print(urls_crawled[0])

# 2.2 Parse each day's report
# titles like 2022年1月22日xxx市新冠肺炎疫情情况
total = generate_json_files(urls_crawled[0:len_urls], archived_html_dir)

# titles like 6月4日深圳无本土新增病例 (the confirmed count for these dates is set to 0)
# total_none_confirmed = generate_none_confirmed(urls_crawled[0:len_urls], archived_html_dir)
This step is my own addition: the regexes never capture every day's data completely, so there are always missing days.
It merges the saved daily totals into one DataFrame, checks which days are missing, and inserts the missing dates so you can see at a glance which days still need filling.
import os
import pandas as pd
from datetime import datetime, timedelta

folder_path = 'directory holding the per-day total CSVs'
file_names = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Merge all the daily CSVs
merged_df = pd.DataFrame()
for file_name in file_names:
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path, encoding='utf-8-sig')
    merged_df = pd.concat([merged_df, df], ignore_index=True)
# Convert the MM-DD strings into full 2022 dates
def convert_date(date_str):
    full_date = f"2022-{date_str}"
    parsed_date = pd.to_datetime(full_date, format="%Y-%m-%d")
    formatted_date = parsed_date.strftime("%Y-%m-%d")
    return formatted_date

# Apply the conversion to the date column, replacing the original values
merged_df['date'] = merged_df['date'].apply(convert_date)
# Use the whole of 2022 as the date range
start_date = datetime(2022, 1, 1).date()
end_date = datetime(2022, 12, 31).date()
date_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

# Find the missing dates
missing_dates = []
for date in date_range:
    if str(date) not in merged_df['date'].astype(str).values:  # compare as strings for an exact match
        missing_dates.append(date)
if len(missing_dates) > 0:
    print("Missing dates:")
    for date in missing_dates:
        print(date.strftime('%Y-%m-%d'))
else:
    print("No dates are missing")
# Build a DataFrame for the missing dates
missing_data = pd.DataFrame({
    'date': [date.strftime('%Y-%m-%d') for date in missing_dates],  # keep dates as strings, matching merged_df
    'confirmed': [float('nan')] * len(missing_dates),
    'asymptomatic': [float('nan')] * len(missing_dates)
})

# Append the missing dates to the merged data
result_data = pd.concat([merged_df, missing_data], ignore_index=True)
# result_data.sort_values(by=['date'], inplace=True)  # sort by date
result_data.to_csv('path to your final output file', encoding='utf-8-sig')
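Since the leftover gaps end up being filled by hand anyway (see the closing notes), here is a rough sketch of doing that on the merged file; the date and numbers are placeholders, not real values:

final_csv = 'path to your final output file'  # the file written above
df = pd.read_csv(final_csv, encoding='utf-8-sig')

# Sort by date and count how many days still lack numbers
df = df.sort_values('date').reset_index(drop=True)
print(df['confirmed'].isna().sum(), 'days still missing a confirmed count')

# Backfill one day after checking the bulletin yourself (hypothetical date and numbers)
df.loc[df['date'] == '2022-01-05', ['confirmed', 'asymptomatic']] = [3, 2]
df.to_csv(final_csv, encoding='utf-8-sig', index=False)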
Unlike the GitHub project, I didn't scrape district-level data for Shenzhen, because the regexes are genuinely hard to get right for accurately pulling out what you want.
The logic for the districts is the same, though: filter the text line by line and save whatever matches.
If you're interested, give it a try yourself; a rough sketch follows.
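A very rough sketch of that idea, assuming district names and counts appear in the same sentence (the pattern and helper below are hypothetical and would need the same trial-and-error tuning as the city-level regexes):

import re

# Hypothetical pattern: grabs "<district> ... N例" pairs from one line of bulletin text
district_pattern = re.compile(r"(福田|罗湖|南山|盐田|宝安|龙岗|龙华|坪山|光明|大鹏)(?:新区|区)?[^,。]*?(\d+)例")

def extract_district_counts(line):
    # Returns [(district, count), ...] for every district mentioned in the line
    return [(name, int(num)) for name, num in district_pattern.findall(line)]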
Also, the original project supports saving as CSV, JSON, and so on; I'm used to working with CSV, so I tweaked that part.
Some of the regexes in the parsing step are duplicated or imperfect, but I couldn't be bothered to fix them; the few missing days I filled in by hand.
That's it. Honestly, unless you have to, don't bother working on COVID data anymore; the hype has passed. I only did it because my project background required it, data limitations and all.