Python爬虫--51job爬取岗位信息并写入txt文件

  • 有借鉴有修改
"""
user:long
"""
import re
import time
from bs4 import BeautifulSoup
#from pack.DbUtil import DbUtil
from pack.RequestUtil import RequestUtil

# Keyword to match job recommendations against, read interactively.
print("请输入您需要推荐匹配的关键字:\n")
key_ = str(input())
keywords = [key_]

# 51job search-URL template; @area / @keyword / @cur_page are placeholders
# filled per request.  NOTE: the original URL contained "°reefrom=99" —
# mojibake of the HTML entity "&deg" — restored here to the intended
# "&degreefrom=99" query parameter.
url_template = (
    'https://search.51job.com/list/@area,000000,0000,00,9,99,@keyword,2,'
    '@cur_page.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
    '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
    '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
    '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)


def build_url(area, keyword, cur_page):
    """Fill the search-URL template for one results page."""
    return (url_template
            .replace('@area', str(area))
            .replace('@keyword', str(keyword))
            .replace('@cur_page', str(cur_page)))


# Use a context manager so record.txt is flushed and closed even on error
# (the original opened the file and never closed it).  Explicit utf-8
# avoids platform-default encoding issues with the Chinese text.
with open('record.txt', mode='w', encoding='utf-8') as f:
    f.write(key_ + '字段相应推荐匹配的就业岗位有:\n')

    for keyword in keywords:

        cur_page = 1
        # First request (nationwide area code 000000, as in the original)
        # is used only to discover the total number of result pages.
        req = RequestUtil()
        html_str = req.get(build_url('000000', keyword, cur_page))

        soup = BeautifulSoup(html_str, 'html.parser')  # lxml is faster if installed
        page_label = soup.select('.p_in .td')[0].string.strip()
        the_total_page = int(re.sub(r"\D", "", page_label))  # keep digits only
        print('适合的就业岗位有:\n')
        # Cap runaway result sets at 100 pages so the recommendation list
        # stays manageable.
        if the_total_page > 300:
            the_total_page = 100

        while cur_page <= the_total_page:
            # Fetch one results page (area code 030200, as in the original).
            req = RequestUtil()
            html_str = req.get(build_url('030200', keyword, cur_page))

            if html_str:
                soup = BeautifulSoup(html_str, 'html.parser')
                rows = soup.select('.dw_table .el')
                if rows:
                    del rows[0]  # first .el row is the table header, not a job

                # Collect one dict per job posting on this page.
                dict_data = []
                for item in rows:
                    job_name = item.find(name='a').string.strip()
                    company_name = item.select('.t2')[0].find('a').string.strip()
                    area = item.select('.t3')[0].string.strip()
                    # pay may legitimately be None when the posting lists no salary
                    pay = item.select('.t4')[0].string
                    update_time = item.select('.t5')[0].string.strip()

                    dict_data.append(
                        {'job_name': job_name, 'company_name': company_name,
                         'area': area, 'pay': pay,
                         'update_time': update_time, 'keyword': keyword}
                    )

                # Guard against an empty page: the original indexed
                # dict_data[0] unconditionally and crashed on no results.
                if dict_data:
                    print(dict_data[0]['job_name'])
                    f.write(dict_data[0]['job_name'] + '\n')
                time.sleep(0.5)  # be polite to the server between pages

            else:
                print('keyword:', keyword, 'fail page:', cur_page)

            # Advance to the next results page.
            cur_page += 1

        else:
            # while/else: runs once the page loop completes without break.
            print('keyword:', keyword, 'fetch end...')
            print('匹配记录保存在record.txt上')
    else:
        # for/else: runs once every keyword has been processed.
        print('Mission complete!!!')

运行结果:
Python爬虫--51job爬取岗位信息并写入txt文件_第1张图片

你可能感兴趣的:(Python)