No22.Python爬虫之使用urllib库爬取拉钩网并写入excel表格

# 这里我们导入一些需要用到的包
import re    # 导入正则模块,用于提取ip地址
import json    # 由于拉钩网返回的数据是json格式,我们json模块来处理
import time    # 避免爬虫爬的太快被封
import random    # 随机模块
import openpyxl    # 操作excel表格
from bs4 import BeautifulSoup    # 用于解析网页
from urllib import request    # 用于对网页发送请求
from urllib import parse    # 用于对数据的编码进行调整


# 因为使用同一个ip地址不断的请求拉钩网会被ban,这里我爬取了某免费代理网站首页的代理,用来动态的更换ip
def get_ip(headers):
    url = "http://www.xicidaili.com/nn/"
    res = request.Request(url, headers=headers) # 发送请求
    resp = request.urlopen(res) # 获取网页
    html = resp.read().decode("utf-8") # 读取网页
    soup = BeautifulSoup(html, "lxml") # 创建一个soup对象
    list_1 = soup.find_all("tr", attrs={"class": "odd"}) # 对网站clss值等于odd的标签进行寻找
    list_1 = list(list_1)
    ip_list = [] # 创建一个空的列表用于存放ip
    for i in list_1:
        i = str(i)
        ret = re.findall(r'\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b', i)    # 匹配ip
        ip_list.append(ret)
    ip_list = [i[0] for i in ip_list]
    return random.choice(ip_list) # 返回一个ip的值

# 该函数用于返回职位的信息
def get_postition_info(url, data, headers, ip):
    handler = request.ProxyHandler({"http":ip}) # 创建一个handler对象
    opener = request.build_opener(handler) # 创建一个opener
    response = request.Request(url, data=parse.urlencode(data).encode("utf-8"), headers=headers, method="POST") # 向目标url发送请求
    response = opener.open(response) # 获取返回值
    info_dict = json.loads(response.read().decode("utf-8")) # 对返回值进行反序列化
    content = info_dict.get("content")
    position_info = content["positionResult"]["result"]
    return position_info # 返回职位信息

# 主函数
def main():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
        ] # 用于存放User-Agent的值
    url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false" # 目标url
    data = {
        "first": "true",
        "kd": "python"
    } # 需要向目标url发送的请求的数据
    wb = openpyxl.Workbook() # 实例化一个workbook对象
    ws = wb.active 
    ws.append(["职位名称", "公司名称", "教育背景", "工作经验", "工作地点", "薪水", "福利"]) # 在第一行加入该信息
    for i in range(1, 31):
        headers = {
            "User-Agent": random.choice(user_agent_list),
            "Cookie":"_ga=GA1.2.1754861990.1532994572; user_trace_token=20180731074931-3488ee45-9453-11e8-a085-5254005c3644; LGUID=20180731074931-3488f280-9453-11e8-a085-5254005c3644; _gid=GA1.2.780489443.1536152530; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAAAGGABCB89F6133A87AC6A0769E51E5E51C76963; TG-TRACK-CODE=search_code; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536155902,1536191279,1536215295,1536233324; LGSID=20180906192844-038b7ea4-b1c8-11e8-8be5-525400f775ce; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DlWbYlzee-L4AwO_DvTNIM-Vvm4cFyIWiuWfSwzcd8PJotqAdjJn-877rZ2tsHkCk%26wd%3D%26eqid%3D8c8d5a4c000238cc000000065b910f67; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2F; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1536233338; LGRID=20180906192858-0b99b8e4-b1c8-11e8-8be5-525400f775ce; SEARCH_ID=af480b6cd28f46adaa9118d02c8c0e42",
            "Referer": "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
        } # 请求头
        data["pn"] = i
        ip = get_ip(headers)
        position_info = get_postition_info(url, data, headers, ip)
        print("正在爬取第%s页数据......" % i)
        for j in range(15):
            position_dict = position_info[j]
            ws.append([position_dict['positionName'], position_dict['companyShortName'], position_dict['education'],
                       position_dict['workYear'], position_dict['city'], position_dict['salary'],
                       position_dict['positionAdvantage']])
        time.sleep(5)
    wb.save("职位信息.xlsx") # 保存excel表

if __name__ == "__main__":
    main()

 

你可能感兴趣的:(python学习之路)