数据分析---1.数据获取----Boss直聘职位信息获取

千里之行,始于足下

  • 朝闻道,夕死可矣
    • 直接上代码

朝闻道,夕死可矣

遇到的问题
1.IP被封
2.csv命名不能使用中文

直接上代码

"""
@author: cht
@time: 2019/7/15 22:23
"""

# -*- coding: utf-8 -*-
import random
import requests
from bs4 import BeautifulSoup
import time
from lxml import etree
import csv


# Pool of desktop-browser User-Agent strings. One entry is chosen at random
# per scraper instance so successive runs do not all present the same client.
user_agent = [
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

class Boss(object):
    """Scraper for Boss Zhipin (zhipin.com) Java job listings.

    Downloads listing pages for a set of city codes through a proxy,
    extracts job fields with XPath and appends them to a local CSV file.
    """

    # Original hard-coded output location; writeCSV() still defaults to it.
    CSV_PATH = 'C:\\Users\\Administrator\\PycharmProjects\\boss\\java\\java2WorkInfo.csv'

    def __init__(self):
        # Pick one User-Agent at random per instance to look less like a bot.
        self.headers = {"User-Agent": random.choice(user_agent)}

    def gethttpIp(self):
        """Fetch one proxy address from a (paid) proxy API.

        If you do not want to pay for proxy IPs, drop this method and
        scrape some free proxies instead (translated from original note).

        Returns:
            dict: a requests ``proxies`` mapping, e.g.
            ``{"https": "https://host:port"}``.
        """
        # NOTE(review): placeholder -- replace with the real proxy API URL.
        httprul = "代理IP的api"
        result = requests.get(httprul)
        print(result.text)
        # Assumes the API answers "host:port ..." -- host before the first
        # colon, port up to the first space after it. TODO confirm format.
        IPList = result.text.split(":")
        print(IPList)
        proxyHost = IPList[0]
        proxyPort = IPList[1].split(" ")[0]
        proxyMeta = "https://%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
        }
        proxies = {"https": proxyMeta, }
        print(proxies)
        return proxies

    def get_job_duty(self, url, proxies):
        """Download one listing page and extract its job entries.

        Args:
            url: full listing-page URL (base URL + page number).
            proxies: requests ``proxies`` mapping from :meth:`gethttpIp`.

        Returns:
            A list of rows ``[title, salary, company, address, seniority,
            degree, detail_url]`` on success (rows are also written to the
            CSV), ``False`` when the page contains no job list, or ``None``
            if extraction raised.
        """
        res = requests.get(url, headers=self.headers, proxies=proxies)
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        # The <ul> of jobs appears under div[2] or div[3] depending on which
        # page variant the site serves; try both before giving up.
        job_info_list = parse_html.xpath('//*[@id="main"]/div/div[2]/ul/li')
        if not job_info_list:
            job_info_list = parse_html.xpath('//*[@id="main"]/div/div[3]/ul/li')
            if not job_info_list:
                return False
        jobInfo = []
        try:
            for tr in job_info_list:
                List1 = []
                List1.append(tr.xpath("./div/div[1]/h3/a/div[1]/text()")[0])  # job title
                List1.append(tr.xpath("./div/div[1]/h3/a/span/text()")[0])    # salary
                List1.append(tr.xpath("./div/div[2]/div/h3/a/text()")[0])     # company
                List1.append(tr.xpath("./div/div[1]/p/text()[1]")[0])         # address
                List1.append(tr.xpath("./div/div[1]/p/text()[2]")[0])         # work seniority
                List1.append(tr.xpath("./div/div[1]/p/text()[3]")[0])         # degree
                List1.append(tr.xpath('./div/div[1]/h3/a/@href')[0])          # detail URL
                jobInfo.append(List1)
            print(jobInfo)
            self.writeCSV(jobInfo)
            return jobInfo
        except Exception as e:
            # A missing field raises IndexError on [0]; log and return None.
            print("错误原因:%s" % e)

    def writeCSV(self, jobInfo, path=None):
        """Append the extracted rows to the output CSV.

        Args:
            jobInfo: list of row lists produced by :meth:`get_job_duty`.
            path: optional output file; defaults to :attr:`CSV_PATH`
                (backward-compatible addition).
        """
        target = path if path is not None else self.CSV_PATH
        # gb18030 keeps Chinese text readable in Excel on Windows; the
        # 'with' block fixes the original's leaked file handle.
        with open(target, 'a', newline='', encoding='gb18030') as file:
            content = csv.writer(file, dialect='excel')
            content.writerows(jobInfo)

    def workOn(self):
        """Scrape up to 10 pages of Java listings for each city code."""
        cityList = ["c100010000", "c101010100", "c101020100", "c101280100",
                    "c101280600", "c101210100", "c101110100", "c101190400",
                    "c101200100", "c101230200", "c101030100", "c101250100",
                    "c101270100", "c101050100", "c101060100", "c101070100",
                    "c101090100", "c101100100", "c101120100", "c101180100",
                    "c101190100", "c101190200", "c101220100", "c101230100",
                    "c101240100", "c101260100", "c101290100", "c101310100",
                    "c101300100"]
        # cityList = ["c101020100"]  # handy single-city list for debugging
        for l in cityList:
            baseurl = "https://www.zhipin.com/%s?query=java&page=" % l
            print("city%s开始" % l)
            i = 1
            proxies = self.gethttpIp()
            while i <= 10:
                print("第%s页" % i)
                url = baseurl + str(i)
                print("url:%s" % url)
                time.sleep(2)  # throttle requests to lower the IP-ban risk
                result = self.get_job_duty(url, proxies)
                # Bug fix: the original compared ``result == False``, so a
                # None from an extraction error kept the loop running; treat
                # any falsy result as "stop paging this city".
                if not result:
                    print("页面为空")
                    break
                i += 1
            print("完成city%s" % l)
        print("全部结束")


if __name__ == "__main__":
    boss = Boss()
    boss.workOn()

未完待续

你可能感兴趣的:(数据分析---1.数据获取----Boss直聘职位信息获取)