爬虫——拉勾网“数据分析”岗位招聘信息爬取

主程序

import requests
import json
from lxml import etree
from mysqlhelper import MysqlHelper
import time
import random

class LaGouSpider():
    """Scrape "数据分析" (data analysis) job postings from lagou.com.

    For each city in ``area_list`` the spider reads the total page count
    from the HTML listing page, then POSTs to the ``positionAjax.json``
    endpoint page by page and stores every posting via ``MysqlHelper``.
    """

    def __init__(self):
        # Database helper plus a parameterized INSERT — 19 placeholders,
        # matching the 19-field tuple built in save_data().
        self.sqlHelper = MysqlHelper()
        self.insertSql = '''INSERT INTO lagou VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        # Lagou rejects requests that lack a browser-like Referer/User-Agent.
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Referer": "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E4%B8%8A%E6%B5%B7",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # Popular cities to crawl.
        self.area_list = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']
        self.index_url = "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}"
        self.start_url = "https://www.lagou.com/jobs/positionAjax.json?"
        self.get_pageNum = ""  # kept for backward compatibility; unused
        self.cookies = None    # refreshed per page in main()

    def delay_time(self):
        """Sleep 4-6 seconds between requests to soften anti-crawler checks."""
        time.sleep(random.randint(4, 6))

    def get_cookies(self, s, city):
        """Load the listing page through session *s* and return its cookies.

        Bug fix: the original called ``requests.get`` and ignored *s*, so
        the cookies were never associated with the session that later
        issues the Ajax POST.
        """
        response = s.get(self.index_url.format(city), headers=self.headers)
        return response.cookies

    def get_num(self, city):
        """Return the total page count shown on the listing page for *city*.

        Falls back to 1 when the counter span is missing (markup changed,
        or the request was served an anti-crawler page).
        """
        response = requests.get(self.index_url.format(city), headers=self.headers)
        html_ele = etree.HTML(response.text)
        pageNum = html_ele.xpath('//span[@class="span totalNum"]/text()')
        return pageNum[0] if pageNum else 1

    def get_response(self, s, url, city, pagenum):
        """POST the Ajax search request for one result page via session *s*."""
        start_params = {
            "city": city,
            "needAddtionalResult": "false"
        }
        start_data = {
            "first": "false",
            "pn": pagenum,   # page number
            "kd": "数据分析"  # search keyword
        }
        return s.post(url, headers=self.headers, cookies=self.cookies,
                      params=start_params, data=start_data)

    @staticmethod
    def _join_if_list(value):
        """Return ``",".join(value)`` when *value* is a list, else *value*.

        The API returns some tag fields as a list of strings and sometimes
        as a plain string; this normalizes both forms for storage.
        """
        return ",".join(value) if isinstance(value, list) else value

    def save_data(self, city, positionName, district, companyShortName, salary, workYear, education, financeStage,
                  companySize,
                  companyLabel, positionLable, longitude, latitude, formatCreateTime, companyFullName, hitags,
                  firstType, thirdType, skillLables):
        """Insert one job posting (19 fields) into the ``lagou`` table."""
        data = (city, positionName, district, companyShortName, salary, workYear, education, financeStage, companySize,
                companyLabel, positionLable, longitude, latitude, formatCreateTime, companyFullName, hitags,
                firstType, thirdType, skillLables)
        self.sqlHelper.execute_modify_sql(self.insertSql, data)

    def main(self):
        """Crawl every configured city page by page and persist postings."""
        for city in self.area_list:
            pageNum = self.get_num(city)
            self.delay_time()
            for num in range(1, int(pageNum) + 1):
                # Fresh session + cookies for each page: Lagou expires
                # Ajax cookies quickly.
                s = requests.Session()
                self.cookies = self.get_cookies(s, city)
                self.delay_time()
                response = self.get_response(s, self.start_url, city, num)
                json_ele = json.loads(response.text)
                zhaopin_list = json_ele['content']['positionResult']['result']
                for info in zhaopin_list:
                    companyLabel = self._join_if_list(info['companyLabelList'])   # company tags
                    positionLable = self._join_if_list(info['positionLables'])    # position tags
                    hitags = self._join_if_list(info['hitags'])                   # benefits
                    skillLables = self._join_if_list(info['skillLables'])         # skill tags
                    print("[INFO]:", city, num)
                    self.save_data(city, info['positionName'], info['district'],
                                   info['companyShortName'], info['salary'],
                                   info['workYear'], info['education'],
                                   info['financeStage'],  # funding stage
                                   info['companySize'],
                                   companyLabel, positionLable,
                                   info['longitude'], info['latitude'],
                                   info['formatCreateTime'],  # posting time
                                   info['companyFullName'], hitags,
                                   info['firstType'], info['thirdType'],
                                   skillLables)
                self.delay_time()

if __name__ == '__main__':
    # Script entry point: crawl all configured cities.
    LaGouSpider().main()

MysqlHelper类文件

import pymysql

class MysqlHelper(object):
    """Thin wrapper around a local pymysql connection for write statements.

    NOTE(review): host/user/password/db are hard-coded; consider moving
    them to configuration.
    """

    def __init__(self):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306,
                                    user='root', passwd='666666',
                                    db='test', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def execute_modify_sql(self, sql, data):
        """Execute a parameterized write statement and commit.

        Rolls back and re-raises on failure so a broken INSERT does not
        leave the connection mid-transaction.
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise

    def close(self):
        """Release the cursor and connection.

        Safe to call more than once, and safe when ``__init__`` failed
        before the attributes were created.
        """
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
            self.cursor = None
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
            self.conn = None

    def __del__(self):
        # A destructor must never raise: it may run during interpreter
        # shutdown, or after a failed __init__ (original code crashed
        # with AttributeError in that case).
        try:
            self.close()
        except Exception:
            pass

你可能感兴趣的:(爬虫)