Python 从国家统计局网站爬取全国各省、市、区数据并存入 MySQL

# -*- coding: utf-8 -*-
import re
import requests
import time
import operator
import pymysql
from functools import reduce
import uuid

# Module-level state used by the functions and the crawl script in this file.
n = 0

# Text file that fun_write_to_txt appends to.
save_route = 'E://China_Province_2019_test.txt'

# Single MySQL connection, opened when the module is loaded and used by
# every fun_*_db function.
conn = pymysql.connect(
    host='localhost', user='root', passwd='root',
    database='province', charset='utf8',
)

code = str(uuid.uuid4())


def fun_getName(result4):
    """Return the entries of ``result4`` whose second field contains no '0'.

    Args:
        result4: Iterable of indexable items; only ``item[1]`` is inspected.

    Returns:
        A new list holding, in their original order, the items for which
        ``'0' not in item[1]``.

    NOTE(review): presumably this drops the numeric-code links so that only
    the place-name links remain -- confirm against the scraped pages.
    """
    return [entry for entry in result4 if '0' not in entry[1]]
def fun_write_to_txt(address):
    """Append ``"---" + address`` followed by a newline to ``save_route``.

    Args:
        address: Text to record; it is concatenated to the "---" prefix,
            so it must be a ``str``.
    """
    # The ``with`` block closes the file on exit, so no explicit close()
    # call is needed.
    with open(save_route, 'a', encoding='utf-8') as f:
        f.write("---" + address + '\n')

def fun_Insert_to_db(pkcode,value,fkcode,ntype):
    """Insert one row into ``tbprovince`` and commit it.

    Args:
        pkcode: Value for the ``pkcode`` column.
        value: Value for the ``sname`` column.
        fkcode: Value for the ``fkcode`` column.
        ntype: Value for the ``ntype`` column; the crawl script in this
            file passes "0", "1" and "2" for its three nesting levels.
    """
    sql = "INSERT INTO tbprovince (pkcode,sname,fkcode, ntype) VALUES (%s,%s,%s,%s)"
    # Scope the cursor so it is closed even when execute() raises.
    with conn.cursor() as cursor:
        cursor.execute(sql, (pkcode, value, fkcode, ntype))
    conn.commit()

def fun_Query_from_db(value,fkcode):
    """Check whether ``tbprovince`` has a row for a name under a parent key.

    Args:
        value: Name compared with the ``sname`` column (converted with
            ``str()``).
        fkcode: Key compared with the ``fkcode`` column.

    Returns:
        Whatever ``cursor.execute()`` returns for the SELECT (PyMySQL
        reports the number of affected rows); the caller in this file
        uses it as a truthy "row exists" flag.
    """
    # Parameterized query: the names come from scraped HTML, so they must
    # not be spliced into the SQL text.
    sql = "select pkcode from tbprovince where sname=%s and fkcode = %s"
    params = (str(value), fkcode)
    print(sql, params)
    with conn.cursor() as cursor:
        res = cursor.execute(sql, params)
    print(res)
    return res
def fun_Get_fkcode_db(value):
    """Return the ``fkcode`` of the first ``tbprovince`` row named ``value``.

    Args:
        value: Name compared with the ``sname`` column (converted with
            ``str()``).

    Returns:
        The ``fkcode`` column of the first matching row.

    Raises:
        LookupError: If no row matches. The original code failed here
            with an UnboundLocalError instead.
    """
    # Parameterized query: the name must not be spliced into the SQL text.
    sql = "select fkcode from tbprovince where sname=%s"
    params = (str(value),)
    print(sql, params)
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        row = cursor.fetchone()
    if row is None:
        raise LookupError("no tbprovince row with sname=%r" % (value,))
    print(row[0])
    return row[0]
def fun_Get_pkcode_db(value,fkcode):
    """Return the ``pkcode`` of the ``tbprovince`` row for a name and parent.

    Args:
        value: Name compared with the ``sname`` column (converted with
            ``str()``).
        fkcode: Key compared with the ``fkcode`` column.

    Returns:
        The ``pkcode`` column of the first matching row.

    Raises:
        LookupError: If no row matches. The original code failed here
            with an UnboundLocalError instead.
    """
    # Parameterized query: the name must not be spliced into the SQL text.
    sql = "select pkcode from tbprovince where sname=%s and fkcode = %s"
    params = (str(value), fkcode)
    print(sql, params)
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        row = cursor.fetchone()
    if row is None:
        raise LookupError(
            "no tbprovince row with sname=%r and fkcode=%r" % (value, fkcode)
        )
    print(row[0])
    return row[0]


# areas = ['月湖区','余江区','贵溪市']
# for area in range(len(areas)):
#     pkid  = str(uuid.uuid4())
#     fun_Insert_to_db(pkid,areas[area],"d28bb303-64aa-4abd-954c-258b2d0fe992","2")

# Module-level accumulators from the original script. Nothing below reads
# them; they are kept only so the module's global names stay unchanged.
results2 = []
results3 = []
results4 = []
results5 = []
Dates1 = []
kv = {'user-agent': 'Mozilla/5.0'}

url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
_BASE_URL = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'

# Captures (relative href, link text) from each single-quoted anchor tag.
# NOTE(review): the pattern in the original listing was "(.*?)<", which has
# one group and therefore cannot yield the [0] = href / [1] = name pairs the
# loops rely on; the anchor markup looks to have been stripped from the
# listing. Confirm this pattern against the live page markup.
_LINK_PATTERN = re.compile("<a href='(.*?)'>(.*?)<")

_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs forever
_REQUEST_DELAY = 3     # seconds to wait before each sub-page request


def _fetch_links(page_url):
    """Download ``page_url`` and return its distinct (href, text) link pairs."""
    response = requests.get(page_url, headers=kv, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    # set() removes duplicate links; the resulting order is arbitrary.
    return list(set(_LINK_PATTERN.findall(response.text)))


def _get_or_create(name, fkcode, ntype):
    """Return the pkcode stored for (name, fkcode), inserting a row if absent."""
    if fun_Query_from_db(name, fkcode):
        return fun_Get_pkcode_db(name, fkcode)
    pkcode = str(uuid.uuid4())
    fun_Insert_to_db(pkcode, str(name), fkcode, ntype)
    return pkcode


result1 = _fetch_links(url)  # links found on the index page
print('result1')

for province_href, province_name in result1:
    # A failure anywhere under one top-level entry is reported and the
    # crawl moves on to the next entry.
    try:
        # Level 0: rows with an empty parent key.
        province_code = _get_or_create(province_name, "", "0")

        time.sleep(_REQUEST_DELAY)
        cities = fun_getName(_fetch_links(_BASE_URL + province_href))
        for city_href, city_name in cities:
            # Level 1: parent key is the level-0 row's pkcode.
            city_code = _get_or_create(city_name, province_code, "1")

            time.sleep(_REQUEST_DELAY)
            counties = fun_getName(_fetch_links(_BASE_URL + city_href))
            for _county_href, county_name in counties:
                # Level 2: parent key is the level-1 row's pkcode; these
                # links are not followed any further.
                _get_or_create(county_name, city_code, "2")
    except Exception as e:
        print(e)

print('well_done')

省市区.png

你可能感兴趣的:(Python 从国家统计局爬取全国各省市区数据到mysql)