#from bs4 import BeautifulSoup
import codecs
import requests
import csv
import random
import re
import json
import pprint
# Open the output CSV in append mode. utf-8-sig writes a BOM so Excel
# auto-detects the encoding; newline='' stops the csv module from
# emitting blank rows on Windows.
f = open(
    '岗位需求.csv',
    mode='a',
    encoding='utf-8-sig',
    newline='')
# DictWriter maps each job dict onto these columns.
csv__ = csv.DictWriter(
    f,
    fieldnames=[
        '职位名称',
        '基本信息',
        '公司名字',
        '工作地点',
        '公司类型',
        '公司规模',
        '公司性质',
        '福利',
        '工资',
        '信息发布时间',
        '职位详情页']
)
# Write the header row only when the file is empty: the file is opened
# for appending, so an unconditional writeheader() would duplicate the
# header on every re-run. (The original never wrote a header at all.)
if f.tell() == 0:
    csv__.writeheader()
# Pool of desktop browser User-Agent strings; one is chosen at random per
# run so the request looks like an ordinary browser rather than a script.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
]
headers = {'User-Agent': random.choice(user_agent_list)}
# Search results, page 1, keyword "python" on 51job.
# FIX: the original URL contained "°reefrom=99" — mojibake from the "&deg"
# prefix of "&degreefrom" being rendered as the HTML entity "°" — which
# corrupted the query string. Restored to "&degreefrom=99".
url = "https://search.51job.com/list/210600,000000,0000,00,9,99,python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
# Download the search-result page with the randomized browser headers.
response = requests.get(url, headers=headers)
# print(response.text)  # uncomment to inspect the raw HTML
# The job list is embedded in the page as a JavaScript assignment:
#     <script>window.__SEARCH_RESULT__ = {...}</script>
# Capture the JSON object lazily up to the closing </script> tag.
# FIX: the original pattern ended in a lazy group with no terminator
# ('=(.*?)'), which always matched the empty string, so json.loads()
# raised on every run. Dots in the marker are now escaped as literals.
html_data = re.findall(r'window\.__SEARCH_RESULT__ =(.*?)</script>', response.text)[0]
# Decode the captured JSON text into Python objects.
json_data = json.loads(html_data)
# FIX: the original printed the json *module* (pprint.pprint(json));
# print the decoded payload instead.
pprint.pprint(json_data)
# 'engine_jds' holds the list of job postings on this page.
engine = json_data['engine_jds']
pprint.pprint(engine)
# Write one CSV row per job posting, mapping the API fields onto the
# Chinese column names declared in the DictWriter above.
for job in engine:
    # attribute_text is a list of short tags (presumably city /
    # experience / degree — TODO confirm against a live response);
    # join them into a single space-separated cell.
    dit = {
        '职位名称': job['job_name'],
        '基本信息': ' '.join(job['attribute_text']),
        '公司名字': job['company_name'],
        '工作地点': job['workarea_text'],
        '公司类型': job['companyind_text'],
        '公司规模': job['companysize_text'],
        '公司性质': job['companytype_text'],
        '福利': job['jobwelf'],
        '工资': job['providesalary_text'],
        '信息发布时间': job['updatedate'],
        '职位详情页': job['job_href'],
    }
    # FIX: csv.DictWriter has no .write() method — the original raised
    # AttributeError on the first row. writerow() is the correct call.
    csv__.writerow(dit)
# Close the output file so buffered rows are flushed to disk.
f.close()