Python 爬取中国各省市四级行政区划

一、需求

爬取行政区划网(http://www.xzqy.net)上各省市的四级行政区划(精确到乡镇、街道),并整理成 Excel 表格。

二、具体步骤

  • 设置代理ip
    原本以为该网站不会限制 IP,结果刚爬完一个省的数据,本机 IP 就被封了。
    因此先爬取 http://www.xicidaili.com/nn/ 上的代理 IP,逐个检测是否可用,并把可用的 IP 保存到一个文件中。
from bs4 import BeautifulSoup
import requests
import random
import lxml
def get_ip_list(url, headers):
    """Scrape candidate proxy addresses from a proxy-listing page.

    Every ``<tr>`` after the first one is read (the first row is skipped,
    presumably because it is the table header -- verify against the page).
    For each remaining row the text of the second and third ``<td>`` cells
    is joined as ``"<cell 2>:<cell 3>"``; these cells are assumed to hold
    the IP address and the port.

    Rows with fewer than three ``<td>`` cells (spacer or sub-header rows)
    are skipped instead of raising ``IndexError``.

    Args:
        url: Address of the listing page.
        headers: HTTP headers sent with the request.

    Returns:
        A list of ``"ip:port"`` strings in page order.
    """
    # A timeout keeps the script from hanging forever on a dead server.
    web_data = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ip_list = []
    for row in soup.find_all('tr')[1:]:
        tds = row.find_all('td')
        if len(tds) < 3:
            continue
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list

if __name__ == '__main__':
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    ip_list = get_ip_list(url, headers=headers)
    urlTest = "https://www.baidu.com"
    # Write every proxy that can fetch the test page to the output file,
    # one "ip:port" entry per line. The context manager closes the file
    # even if something unexpected is raised.
    with open('G://ip.txt', 'w+') as file:
        for address in ip_list:
            proxy_url = 'http://' + address
            # requests picks the proxy by the scheme of the target URL, so
            # the dict must be keyed by 'http' / 'https'; any other key is
            # ignored and the request would go out directly.
            proxies = {'http': proxy_url, 'https': proxy_url}
            try:
                # The timeout stops a dead proxy from blocking the whole run.
                ss = requests.get(urlTest, proxies=proxies, timeout=10)
            except Exception as e:
                # Best-effort probe: report the failure, try the next proxy.
                print(e)
                continue
            if ss.status_code == 200:
                file.write(address + "\n")
    print(ip_list)
  • 爬取数据
from bs4 import BeautifulSoup
import requests
import re
import xlwt
import datetime
import time
import random
import threading
def findCity(url):
    """Fetch a page and collect the link and name of each listed city/district.

    The page is requested through a proxy returned by ``getip()``. The
    ``<td class="parent">`` cells are converted to one string, split into
    fragments, and the module-level regexes ``pattern`` and ``pattern1`` are
    applied to each fragment to extract the relative link and the name.

    Args:
        url: Address of the page to fetch.

    Returns:
        A two-element list ``[urls, names]`` of equal-length lists, where
        each URL is the site root joined with the extracted relative link.
    """
    site_root = "http://www.xzqy.net/"
    request_headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
    proxy = getip()
    response = requests.get(url=url, headers=request_headers, proxies=proxy)
    parsed = BeautifulSoup(response.text, "html5lib")
    cells_text = str(parsed.find_all('td', class_="parent"))
    # NOTE(review): the delimiter below is the literal text '(.*)', not a
    # regex; it looks like the original delimiter was lost -- confirm
    # against the upstream source.
    # The first fragment is discarded as it carries no usable entry.
    fragments = cityfragments = cells_text.split('(.*)')[1:]

    links = []
    names = []
    for fragment in fragments:
        link = site_root + pattern.findall(fragment)[0]
        label = pattern1.findall(fragment)[0]  # city or district name
        links.append(link)
        names.append(label)
    return [links, names]


def getMessage(url):
    """Fetch a county-level page and collect the names listed on it.

    The page is requested through a proxy returned by ``getip()``. The
    ``<td class="parent">`` cells are converted to one string, split into
    fragments, and the module-level regex ``pattern1`` extracts one name
    from each fragment.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with the first ``pattern1`` match of every fragment.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
    proxy = getip()
    response = requests.get(url=url, headers=request_headers, proxies=proxy)
    parsed = BeautifulSoup(response.text, "html5lib")
    cells_text = str(parsed.find_all('td', class_="parent"))
    # NOTE(review): the delimiter below is the literal text '(.*)', not a
    # regex; it looks like the original delimiter was lost -- confirm
    # against the upstream source.
    # The first fragment is discarded as it carries no usable entry.
    fragments = cells_text.split('(.*)')[1:]

    return [pattern1.findall(fragment)[0] for fragment in fragments]
def getip(path="G://ip.txt"):
    """Pick a random proxy from the saved proxy list.

    The file is expected to hold one ``ip:port`` entry per line. Blank
    lines are ignored and surrounding whitespace (including the trailing
    newline) is stripped before the proxy URL is built.

    Args:
        path: Location of the proxy list file. Defaults to the file the
            proxy-collection script writes.

    Returns:
        A dict suitable for the ``proxies=`` argument of ``requests``,
        mapping both ``'http'`` and ``'https'`` to ``http://ip:port``.

    Raises:
        IndexError: If the file contains no proxy entries.
    """
    with open(path, 'r') as f:
        addresses = [line.strip() for line in f if line.strip()]
    proxy = 'http://' + random.choice(addresses)
    # requests selects a proxy by the scheme of the target URL, so the keys
    # must be scheme names; any other key is silently ignored.
    return {'http': proxy, 'https': proxy}

def province(url, Pname):
    """Crawl one province and save the result as ``G://<Pname>.xls``.

    The workbook has a single sheet named ``Pname`` with three columns:
    column 0 holds the city name (written on the first row of that city's
    group), column 1 the county/district name, and column 2 the names
    returned by ``getMessage`` for that county joined with "、".

    A city that has no counties still takes up one row. Without this the
    next city name would be written to the same cell, which xlwt rejects
    as an overwrite.

    Args:
        url: Address of the province page.
        Pname: Province name, used as both sheet name and file name.
    """
    city, name = findCity(url)
    wrd = xlwt.Workbook()
    sheet = wrd.add_sheet(Pname)
    row = 0  # next free row; also the first row of the current city
    print("即将爬取"+Pname+"数据*************************************")
    for city_url, city_name in zip(city, name):
        sheet.write(row, 0, city_name)

        countryU, CountryName = findCity(city_url)
        print("爬取" + city_name + "数据中")

        for county_url, county_name in zip(countryU, CountryName):
            sheet.write(row, 1, county_name)
            message = getMessage(county_url)
            print('正在爬取' + county_name + "数据")

            sheet.write(row, 2, "、".join(message))
            print(county_name + "数据写入完成")
            row += 1
        if not CountryName:
            # Keep the city's own row so the next city starts on a new one.
            row += 1
        print(city_name + "数据写入完成")
    wrd.save("G://"+Pname+".xls")
    print("爬取完成")

# Entry script: fetch the site's front page through a random proxy, extract
# each province's name and link, then crawl the provinces one by one.
# This runs at module level, so `pattern` and `pattern1` below become the
# globals that findCity() and getMessage() rely on.
Target="http://www.xzqy.net/"
headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'}
ip = getip()
rep = requests.get(url=Target, headers=headers, proxies=ip)

text=rep.text;
soup=BeautifulSoup(text, "html5lib")
# NOTE(review): str.split('') raises ValueError ("empty separator"); the
# original delimiter appears to have been lost -- restore it from the
# upstream source before running.
province0=str(soup.find_all('div',class_="navi")).split('')# get each province's name and URL
# `pattern` captures the text between "./" and a double quote (used as the
# relative link); `pattern1` captures everything after a ">" (used as the
# name).
# NOTE(review): `pattern1` may have lost a trailing delimiter -- confirm.
pattern = re.compile('./(.*)"')
pattern1=re.compile('>(.*)')
# Drop the first two fragments and the last one; presumably they are
# navigation entries that are not provinces -- verify against the page.
del province0[0]
del province0[0]
del province0[-1]
for i in range(0,len(province0)):
    target1=Target+pattern.findall(province0[i])[0]
    target2=pattern1.findall(province0[i])[0]# province name
    province(target1,target2)
    
  • 结果
    耗时约一个半小时,下载完中国 34 个省、自治区、直辖市等省级行政区的数据,结果如下图所示。

    image.png

    image.png

数据及源代码上传至本人github:https://github.com/nixuanhui/spider

你可能感兴趣的:(python爬取中国各省市四级行政规划)