Since I recently needed data for all cities in China, I scraped the 2019 first-level (province-level) division data from the National Bureau of Statistics website.
import random
import re
import requests
import time
import sys
fileSavePath = 'E://data/China_Province_2019.csv'  # output path for the scraped data
fileSavePath2 = 'E://data/China_Province_2019_mistake.csv'  # output path for error records
# Fetch the first level: provinces and municipalities
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
response = requests.get(url)
response.raise_for_status()  # raises an HTTPError if the request came back with an unsuccessful status code
response.encoding = response.apparent_encoding  # apparent_encoding: the encoding detected from the response body
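# Note: these stats.gov.cn pages are usually served as GB2312/GBK, so relying on apparent_encoding
# avoids mojibake; if detection ever fails, the encoding can also be set explicitly, e.g. response.encoding = 'gbk'.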
pattern = re.compile("<a href='(.*?)'>(.*?)<")  # each province entry looks like <a href='11.html'>北京市<br/></a>
result1 = list(set(re.findall(pattern, response.text)))  # (link, name) pairs pointing at the second-level pages
print(result1)
print('*'*55)
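# result1 should now hold (link, name) tuples such as ('11.html', '北京市') or ('44.html', '广东省');
# the link is relative to the .../2019/ directory and leads to that province's city list.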
# Fetch the second-level (prefecture/city) entries from every first-level page
for cycle1 in range(len(result1)):
    try:
        url1 = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/{0}'.format(result1[cycle1][0])  # first-level page URL
        address1 = result1[cycle1][1]  # first-level division name
        # print('{0} {1}'.format(address1, url1))
        response1 = requests.get(url1)
        response1.raise_for_status()
        response1.encoding = response1.apparent_encoding
        response1.close()
        pattern1 = re.compile("<a href='(.*?)'>(.*?)<")  # same (link, text) pattern as above
        result2_1 = list(set(re.findall(pattern1, response1.text)))
        result2 = []
        for result in result2_1:  # city names and numeric division codes are mixed together, so drop the code entries
            if '0' not in result[1]:  # the 12-digit codes always contain '0'; Chinese place names never do
                result2.append(result)
                # print(result)
        print(result2)
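        # result2 is expected to look like [('11/1101.html', '市辖区'), ...] for a municipality
        # or [('44/4401.html', '广州市'), ...] for a province: the second-level link plus its name.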
        for cycle in result2:
            address = '{0}/{1}/{2}'.format(address1, cycle[1], cycle[0])  # first-level name / second-level name / second-level link
            print(address)
            with open(fileSavePath, 'a', encoding='utf-8') as f:  # the with block closes the file automatically
                f.write(address)
                f.write('\n')
    except Exception:
        print("Unexpected error:", sys.exc_info())
        with open(fileSavePath2, 'a', encoding='utf-8') as f:
            f.write('{0}|first-level error|first-level error|first-level error|{1}\n'.format(result1[cycle1], sys.exc_info()))
        time.sleep(10)
        continue
    time.sleep(random.random() * 5)  # mimic human browsing so the IP does not get blocked for requesting too frequently
print('First-level division export complete!')
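A plain requests.get against stats.gov.cn occasionally times out or gets refused when the site is busy, which is what the error file above is there to catch. If that happens often, a small retry helper could be swapped in for the direct requests.get calls. The sketch below is only an illustration under assumed defaults: the fetch_html name, the User-Agent string, and the retry/timeout values are placeholders, not part of the original script.

import random
import time

import requests

def fetch_html(url, retries=3, timeout=10):
    """Fetch one page with a browser-like User-Agent, retrying a few times before giving up."""
    headers = {'User-Agent': 'Mozilla/5.0'}  # assumed header; the default python-requests UA is sometimes rejected
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            resp.encoding = resp.apparent_encoding
            return resp.text
        except requests.RequestException as exc:  # covers HTTPError, timeouts and connection errors
            print('attempt {0} failed for {1}: {2}'.format(attempt + 1, url, exc))
            time.sleep(5 + random.random() * 5)  # back off a little before retrying
    raise RuntimeError('giving up on {0} after {1} attempts'.format(url, retries))

# Usage:
# html = fetch_html('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html')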