# 爬取51job招聘网 (scrape job listings from the 51job recruitment site)

import urllib.request
from bs4 import BeautifulSoup
import time
import pymongo
import pymysql

#https://search.51job.com/list/170200,000000,0000,00,9,99,python,2,2.html

def handle_request(keyword, page, url):
    """Build the HTTP request for one page of search results.

    Args:
        keyword: Search keyword. It is percent-encoded before being placed
            in the URL so that non-ASCII keywords still yield a valid URL.
        page: Page number, substituted into the second ``{}`` placeholder.
        url: URL template with two positional ``{}`` placeholders
            (keyword first, then page).

    Returns:
        A ``urllib.request.Request`` for the formatted URL that carries a
        desktop-browser ``User-Agent`` header.
    """
    # Imported locally so the function does not rely on the file's import block.
    from urllib.parse import quote

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    }
    # An unencoded non-ASCII keyword would make urlopen fail when it tries
    # to encode the request line as ASCII.
    target = url.format(quote(str(keyword)), page)
    return urllib.request.Request(url=target, headers=headers)
#用bs4解析
def parse_content(content, db):
    """Parse one search-result page and store every job row found on it.

    Args:
        content: Decoded HTML text of a result page.
        db: Database handle, passed through unchanged to ``save_to_mysql``.

    Raises:
        IndexError: If a row lacks one of the expected ``.t1``-``.t5`` cells.
        KeyError: If a job or company link has no ``title`` attribute.
    """
    soup = BeautifulSoup(content, 'lxml')
    # The first ``.el`` child of #resultList is skipped; presumably it is the
    # table header row -- confirm against the page markup.
    rows = soup.select('#resultList > .el')[1:]

    for row in rows:
        # Job title and company name are read from the links' title attributes.
        jobname = row.select('.t1 > span > a')[0]['title']
        company = row.select('.t2 > a ')[0]['title']
        # Location, salary and publish date are the cells' direct text;
        # ``.string`` is None when a cell has no single text child.
        area = row.select('.t3')[0].string
        salary = row.select('.t4')[0].string
        publish_time = row.select('.t5')[0].string

        # The Chinese keys are the record schema that save_to_mysql reads.
        items = {
            '公司职业': jobname,
            '公司名称': company,
            '工作地点': area,
            '薪资': salary,
            '发布时间': publish_time,
        }
        save_to_mysql(db, items)

#第一种是保存到mysql中
def connect_db(host='localhost', port=3306, user='root', password='123456',
               database='51job', charset='utf8'):
    """Open a MySQL connection through pymysql.

    The defaults reproduce the settings that were previously hard-coded, so
    calling ``connect_db()`` with no arguments behaves as before.

    Args:
        host: MySQL server host name.
        port: MySQL server TCP port.
        user: Login user name.
        password: Login password.
        database: Name of the database to use.
        charset: Connection character set.

    Returns:
        The open pymysql connection; the caller is responsible for closing it.
    """
    # NOTE(review): the default credentials live in source code; prefer
    # passing them in from configuration or the environment.
    # MySQL tables can use either the InnoDB or the MyISAM storage engine.
    db = pymysql.connect(host=host, port=port, user=user, password=password,
                         database=database, charset=charset)
    return db

#第二种是保存到mongodb中
def connect_mongodb(host='localhost', port=27017):
    """Create a MongoDB client through pymongo.

    The defaults reproduce the settings that were previously hard-coded.

    Args:
        host: MongoDB server host name.
        port: MongoDB server TCP port.

    Returns:
        A ``pymongo.MongoClient``; the caller is responsible for closing it.
    """
    client = pymongo.MongoClient(host=host, port=port)
    return client
#如果用mysql需要自己建立数据库,再创建对应的表格
def save_to_mysql(db, items):
    """Insert one scraped job record into the ``job`` table.

    The database and the ``job`` table (columns jobname, company, area,
    salary, publish_time) must already exist.

    Args:
        db: Open DB-API connection providing ``cursor()``, ``commit()`` and
            ``rollback()``.
        items: Record dict with the keys '公司职业', '公司名称', '工作地点',
            '薪资' and '发布时间'.

    Raises:
        KeyError: If ``items`` is missing one of the expected keys.
    """
    # Placeholders let the driver escape the values. Interpolating scraped
    # text straight into the statement breaks on any embedded double quote
    # and allows SQL injection.
    sql = ('insert into job(jobname, company, area, salary, publish_time) '
           'values(%s,%s,%s,%s,%s)')
    params = (
        items['公司职业'],
        items['公司名称'],
        items['工作地点'],
        items['薪资'],
        items['发布时间'],
    )
    try:
        # The context manager closes the cursor even when execute() fails.
        with db.cursor() as cursor:
            cursor.execute(sql, params)
        db.commit()
    except Exception as e:
        # Best effort: report the failed row and carry on with the crawl.
        print(e)
        db.rollback()

def main():
    """Prompt for a keyword and page range, then crawl and store each page.

    Reads the search keyword, first page and last page from standard input,
    fetches every page in that inclusive range, decodes it as GBK, and hands
    it to ``parse_content`` to be stored through the MySQL connection. The
    crawl pauses two seconds after each page.

    Raises:
        ValueError: If a page number entered is not an integer.
    """
    keyword = input('请输入要搜索的关键字-')
    start_page = int(input('请输入起始页码-'))
    end_page = int(input('请输入结束页码-'))
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,{},2,{}.html'

    # NOTE: an earlier variant used MongoDB (client = connect_mongodb(), then
    # the ``job`` collection of the ``job51`` database); the active path is
    # MySQL only.
    db = connect_db()
    try:
        for page in range(start_page, end_page + 1):
            print('正在爬取--第%s页--....' % page)
            request = handle_request(keyword, page, url)
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(request) as response:
                content = response.read().decode('gbk')
            parse_content(content, db)
            print('结束爬取--第%s页--...' % page)
            # Throttle the crawl between pages.
            time.sleep(2)
    finally:
        # Release the connection even if a page fails to download or parse.
        db.close()

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

# 你可能感兴趣的:(爬虫学习)