# Scrape China's five-level administrative division addresses with Python.
# (Beginner code — quality to be improved in a later revision.)
# Data source page: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/

import urllib.request
from bs4 import BeautifulSoup
import time
import random


def url_open(url):
    """Fetch *url* and return the response body decoded as GBK text.

    The stats.gov.cn division pages are served in GBK encoding, hence the
    explicit decode.

    Fixes over the original:
    - The User-Agent was previously urlencoded and sent as POST *body*
      data (with a malformed ``'User - Agent'`` key full of spaces), so
      the server never received a User-Agent header at all.  It is now
      sent as a real request header on a plain GET request.
    - The response is closed via a context manager instead of leaking.

    :param url: absolute URL of the page to fetch
    :return: the page HTML as a ``str``
    :raises urllib.error.URLError: on network failure
    :raises UnicodeDecodeError: if the body is not valid GBK
    """
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/65.0.3325.181 Safari/537.36'),
    }
    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as response:
        return response.read().decode('gbk')


def bs(html, url):
    """Walk the 5-level administrative-division tree and dump it to a file.

    Starting from the province index page *html*, this follows the links
    level by level (province -> city -> county/district -> town/sub-district
    -> village/neighborhood committee) and appends one comma-separated line
    per village to ``D:/地址/address.txt``, also printing it to stdout.

    :param html: HTML of the province index page (already fetched)
    :param url: base URL of the index page; relative links are joined onto it

    NOTE(review): the shared counter ``i`` below is a cell-skip counter used
    to step over the code-number cells in each table row (each row holds a
    numeric-code cell and a name cell); its increments differ per nesting
    level and the logic is order-sensitive, so the code is left byte-for-byte
    unchanged here and only documented.
    """
    # Skip counter shared across all nesting levels — starts at 0 so the
    # very first province cell ("Beijing") is processed immediately.
    i = 0
    soup = BeautifulSoup(html, "html.parser")
    # Level 1: provinces
    for li in soup.find_all('tr', class_="provincetr"):
        for a in li:
            if i > 0:
                i = i - 1
            else:
                province = a.find('a').get_text()
                # print(province)
                
                # Relative link to the next level (cities) of this province
                b = a.find('a')
                plink = b.get('href')
                # print(plink)
                i = 1
                # Level 2: fetch and parse the city page
                citylink = url_open(url + plink)
                # print(citylink)
                citysoup = BeautifulSoup(citylink, "html.parser")
                for citytr in citysoup.find_all('tr', class_="citytr"):
                    for citya in citytr:
                        # time.sleep(random.randint(3000, 7000) / 1000)  # optional random 3-7s throttle
                        city = citya.find('a').get_text()
                        # print(city)

                        if i > 0:
                            pass
                            i = i - 1
                        else:
                            # Relative link to the next level (counties/districts)
                            c = citya.find('a')
                            plink2 = c.get('href')
                            # print(plink2)

                            i = i+1
                            # Level 3: fetch and parse the county/district page
                            countylink = url_open(url + plink2)
                            # print(plink[0:1])
                            countsoup = BeautifulSoup(countylink, 'html.parser')
                            for countytr in countsoup.find_all('tr', class_="countytr"):
                                for countya in countytr:
                                    # time.sleep(random.randint(1000, 3000) / 1000)  # optional random 1-3s throttle
                                    # Some county cells carry no link (e.g. a
                                    # "市辖区" placeholder) — take the bare text.
                                    if countya.find('a') is None:
                                        count = countya.string
                                        # print(count)
                                    else:
                                        county = countya.find('a').get_text()
                                        count = county
                                        # print(count)
                                        if i > 0:
                                            pass
                                            i = i - 1
                                        else:
                                            # Relative link to the next level (towns/sub-districts)
                                            d = countya.find('a')
                                            plink3 = d.get('href')
                                            # print(plink3)

                                            i = i + 1
                                            # Level 4: towns / sub-district offices.
                                            # plink2[0:2] is the province-code
                                            # directory prefix of the link path.
                                            townlink = url_open(url + plink2[0:2]+'/'+plink3)
                                            townsoup = BeautifulSoup(townlink, 'html.parser')
                                            for towntr in townsoup.find_all('tr', class_='towntr'):
                                                for towna in towntr:
                                                    town = towna.find('a').get_text()
                                                    # print(town)
                                                    # time.sleep(random.randint(700, 1500) / 1000)  # optional random 0.7-1.5s throttle

                                                    if i > 0:
                                                        pass
                                                        i = i - 1
                                                    else:
                                                        # Relative link to the final level (villages)
                                                        e = towna.find('a')
                                                        plink4 = e.get('href')
                                                        # print(plink4)

                                                        i = i + 2
                                                        # Level 5: village / neighborhood committees
                                                        villagelink = url_open(url + plink2[0:2]+'/'+plink3[0:2]+'/' + plink4)
                                                        villagesoup = BeautifulSoup(villagelink, 'html.parser')
                                                        for villagetr in villagesoup.find_all('tr', class_='villagetr'):
                                                            for villagetd in villagetr:
                                                                time.sleep(random.randint(500, 1000) / 1000)  # random 0.5-1s delay to throttle requests
                                                                villagetd.find_all('td')
                                                                # print(villagetd.string)
                                                                if i > 0:
                                                                    pass
                                                                    i = i - 1
                                                                else:
                                                                    village = villagetd.string
                                                                    i = i + 2
                                                                    # Emit one full 5-level record, both to
                                                                    # stdout and appended to the output file.
                                                                    print(province+','+city+',' + count + ',' + town + ',' + village)
                                                                    with open('D:/地址/address.txt', 'a') as f:
                                                                        f.write(province+',' + city + ',' + count + ','+town+',' + village)
                                                                        f.write('\n')
                                                        # Reset the skip counter before the next town row.
                                                        i = 1


# Entry point: fetch the 2019 province index page and walk the whole
# five-level administrative-division tree from it.
base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/"

index_page = url_open(base_url)
bs(index_page, base_url)

# Topic: web crawler