用代理的方式爬取boss直聘的信息

import requests
from bs4 import BeautifulSoup
from get_proxy import GetProxy
from urllib import parse
from day03.pymysql_text import Mysql_text
# Request headers sent with every HTTP call; kept at module level so that
# both scraping functions can share them.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    ),
}
# Crawl the BOSS Zhipin search result pages and scrape every job detail page
# they link to, switching proxy whenever a request fails.
def _scrape_detail_with_retry(my_ip, info_url, attempts=4):
    """Scrape one detail page, switching proxy after each failed attempt.

    Args:
        my_ip: Proxy provider exposing ``proxy`` and ``updata_proxy()``.
        info_url: Absolute URL of the job detail page.
        attempts: Maximum number of tries before giving up on this URL.

    Returns:
        True as soon as ``get_boss_info`` succeeds, False if every attempt
        raised.
    """
    for _ in range(attempts):
        try:
            get_boss_info(my_ip, info_url)
        except Exception as e:
            # Any failure is treated as a dead or blocked proxy: switch to a
            # new one and try the same URL again.
            my_ip.updata_proxy()
            print(e)
        else:
            # Stop immediately on success so no further proxies are consumed.
            return True
    return False


def get_all_info(my_ip):
    """Walk listing pages 1-3 and scrape every job detail page found.

    Each listing page is requested through ``my_ip.proxy``. If fetching or
    parsing the listing raises, the proxy is replaced via
    ``my_ip.updata_proxy()`` and the page is retried, up to 4 attempts. Once a
    listing has been fetched successfully its detail pages are scraped exactly
    once and the retry loop stops.

    Args:
        my_ip: Proxy provider exposing a ``proxy`` attribute (passed to
            ``requests`` as ``proxies``) and an ``updata_proxy()`` method.
    """
    for page in range(1, 4):
        url = 'https://www.zhipin.com/c101010100/h_101010100/?query=python&page=%s&ka=page-%s' % (page, page)
        for _attempt in range(4):
            try:
                response = requests.get(url, headers=headers, proxies=my_ip.proxy, timeout=5)
                # Treat HTTP error statuses as a failed attempt so the proxy
                # gets replaced instead of parsing an error page.
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'lxml')
                a_list = soup.select('div.info-primary > h3 > a ')
                # Build every detail URL up front so a malformed link cannot
                # abort the page after some jobs were already stored.
                info_urls = [parse.urljoin(url, a_ele['href']) for a_ele in a_list]
            except Exception as e:
                print(e)
                my_ip.updata_proxy()
                continue

            for info_url in info_urls:
                print(info_url)
                _scrape_detail_with_retry(my_ip, info_url)
            # The listing was handled; without this break the same page (and
            # all of its detail pages) would be fetched and stored 4 times.
            break
# Download one job detail page, extract its fields and store them in MySQL.
def get_boss_info(my_ip, info_url):
    """Scrape a single job detail page and insert the result into table ``boss``.

    The page is requested through ``my_ip.proxy`` with a 5 second timeout.
    Four values are extracted (first ``h1``, the ``span.badge`` text, the
    first ``p`` under ``div.info-primary`` and the first ``div.text``),
    printed as a tuple, and handed to ``Mysql_text().sqlzz`` together with a
    parameterized INSERT statement.

    Nothing is caught here: a request error, or a missing element
    (``IndexError`` / ``AttributeError``), propagates to the caller.

    Args:
        my_ip: Proxy provider exposing a ``proxy`` attribute.
        info_url: Absolute URL of the job detail page.
    """
    def _one_line(raw):
        # Drop embedded newlines, then trim surrounding whitespace.
        return raw.replace('\n', '').strip()

    resp = requests.get(info_url, headers=headers, proxies=my_ip.proxy, timeout=5)
    page = BeautifulSoup(resp.text, 'lxml')

    heading = page.select('h1')[0]
    title = heading.text

    badge = page.find('span', class_="badge")
    price = _one_line(badge.text)

    primary_paragraph = page.select('div.info-primary p')[0]
    address = _one_line(primary_paragraph.text)

    description = page.select('div.text')[0]
    yaoqiu = description.text

    record = (title, price, address, yaoqiu)
    print(record)

    # A fresh instance of the project's MySQL wrapper is used for each row.
    insert_sql = 'insert into boss(title,price,address,yaoqiu) VALUES (%s,%s,%s,%s)'
    Mysql_text().sqlzz(insert_sql, record)



if __name__ == '__main__':
    # Script entry point: create the proxy provider and hand it straight to
    # the crawler.
    proxy_source = GetProxy()
    get_all_info(proxy_source)


你可能感兴趣的:(boss直聘)