2.爬取智联招聘的职位信息1.0

目标

1.手动输入要爬取的职位,爬取的页数,目标城市
2.将爬取到的数据储存到指定的文件中
3.记录爬取记录到log日志文件中

主页模块

requests模块用于爬取数据,lxml模块用于筛选数据(XPath)
代码:

# coding:utf-8
"""
    爬取智联招聘10天内的所有python相关职位
    得到公司名称://table[@class='newlist'][2]/tbody/tr[1]/td[@class='zwmc']/div/a[1]
    得到公司名称://table[@class='newlist'][2]/tbody/tr[1]/td[@class='gsmc']/a[1]
    月薪:       //table[@class='newlist'][2]/tbody/tr[1]/td[@class='zwyx']
    地点:       //table[@class='newlist'][2]/tbody/tr[1]/td[@class='gzdd']
    一页中所有的职位 //tr[1]/td[@class='zwmc']/div/a[1]
    注意:以上筛选语句在代码中需要删除其中的‘tboday’才能正常获取数据
"""
# 引入相关的模块
import requests
from lxml import etree
import re
import bs4
import time
import datetime
import os

# Running total of job records written to the output file.
count = 0

# Number of result pages to scrape.
page = int(raw_input("请输入要爬取的页数:"))
# Job keyword to search for.
job_want = raw_input("请输入要爬取的工作:")

# Target city (or cities, separated by ';') for the search.
job_city = raw_input("请输入您要爬取的城市(多个城市用‘;’分隔):")

# Spoof a browser User-Agent so the site serves normal pages.
headers = {
    "User-Agent": "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
}

# Make sure the output directory exists before any file writes
# (the original crashed if ./zhaopin was missing; `os` was imported but unused).
if not os.path.exists("zhaopin"):
    os.makedirs("zhaopin")

for page_count in range(1, page + 1):
    # Search endpoint of zhaopin.com.
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx?"

    # GET query parameters for this page of results.
    get_data = {
        "jl": job_city,   # location(s)
        "kw": job_want,   # search keyword
        "sm": 0,
        # BUG FIX: the original sent "p": page (the user's *total* page
        # count) on every iteration, so each request fetched the same
        # page; send the current page number instead.
        "p": page_count,
        "isfilter": 0,
        "isadv": 0,
    }

    # Fetch one page of search results.
    response = requests.get(url, headers=headers, params=get_data)
    content = response.text

    # Parse the HTML so XPath queries can be run against it.
    content_xpath = etree.HTML(content)
    print(content_xpath)

    # Count how many job entries this page contains.
    job_names = content_xpath.xpath("//tr[1]/td[@class='zwmc']/div/a[1]")
    job_num = len(job_names)
    print(job_num)

    # Index of the result table being read; the first job row lives in
    # table[2] (table[1] is the header), hence the starting value of 2.
    msg_num = 2
    for i in range(0, job_num):
        # Job title.
        job_name = content_xpath.xpath("//table[@class='newlist'][%s]//tr[1]/td[@class='zwmc']/div/a[1]" % msg_num)
        job_name = job_name[0].xpath("string(.)")

        # Company name.
        job_company = content_xpath.xpath("//table[@class='newlist'][%s]//tr[1]/td[@class='gsmc']/a[1]" % msg_num)
        job_company = job_company[0].xpath("string(.)")

        # Monthly salary.
        job_pay = content_xpath.xpath("//table[@class='newlist'][%s]//tr[1]/td[@class='zwyx']" % msg_num)
        job_pay = job_pay[0].xpath("string(.)")
        # Work location.
        job_place = content_xpath.xpath("//table[@class='newlist'][%s]//tr[1]/td[@class='gzdd']" % msg_num)
        job_place = job_place[0].xpath("string(.)")

        msg_num += 1
        count += 1
        # Assemble one complete record.
        # BUG FIX: the original passed job_place twice and never used
        # job_pay, so the salary field actually showed the location.
        full_msg = u"%s.职位名称:%s,公司名称:%s,职位月薪:%s,工作地点%s;\n" % (count, job_name, job_company, job_pay, job_place)
        print(full_msg)

        # Append the record to the output file (encoded to utf-8 bytes,
        # since this is a Python 2 unicode string).
        with open("zhaopin/zhilian.txt", "a") as f:
            f.write(full_msg.encode("utf-8"))

# Record this run (timestamp + number of rows stored) in the log file.
log_time = str(datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))

log_msg = "爬虫程序在%s,向zhilian.txt文件中存入了%s条数据\n" % (log_time, count)
with open("zhaopin/log.txt", "a") as f:
    f.write(log_msg)

你可能感兴趣的:(2.爬取智联招聘的职位信息1.0)