# Scrape Python job postings from 51job.com and store them in MySQL.
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree
from pymysql import connect
def get_list_url(url):
    """Fetch one search-result page and return the detail-page URLs on it."""
    # Randomize the User-Agent on every request to look less like a bot.
    ua = UserAgent()
    headers = {'User-Agent': ua.chrome}
    rq = requests.get(url, headers=headers)
    print(url)
    # Throttle the crawl: sleep 1-3 seconds between requests.
    time.sleep(random.randint(1, 3))
    rq.encoding = 'gbk'  # 51job pages are GBK-encoded
    soup = BeautifulSoup(rq.text, 'lxml')
    # Each job row in the result list is a <div class="el"> inside #resultList.
    company_div = soup.find('div', id='resultList').findAll('div', class_='el')
    urls = []
    for x in company_div:
        try:
            urls.append(x.find('a')['href'])
        except (TypeError, KeyError):
            # Rows without a link (e.g. the header row) are skipped.
            continue
    return urls
def get_massage(urls):
    """Fetch each job-detail page, extract the fields, and save them."""
    for url in urls:
        ua = UserAgent()
        headers = {'User-Agent': ua.chrome}
        rq = requests.get(url, headers=headers)
        print(url)
        time.sleep(random.randint(1, 3))
        rq.encoding = 'gbk'
        data = rq.text
        h = etree.HTML(data)
        soup = BeautifulSoup(data, 'lxml')
        try:
            # Job title
            zhiwei = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()')[0]
            print(zhiwei)
            # Company name
            company = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()')[0]
            print(company)
            # Work location
            address = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[1]')[0]
            print(address)
            # Company type (private, state-owned, ...)
            xingzhi = h.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[1]/text()')[0]
            print(xingzhi)
            # Salary
            xinzi = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()')[0]
            print(xinzi)
            # Education requirement
            xuewei = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[3]')[0].replace(' ', '').replace('\r', '')
            print(xuewei)
            # Experience requirement
            jingyan = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[2]')[0].replace(' ', '').replace('\r', '')
            print(jingyan)
            # Company size
            guimo = h.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[2]/text()')[0]
            print(guimo)
            # Industry
            leixing = h.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[3]/a/text()')[0]
            print(leixing)
            # Benefits
            fuli = soup.find('div', class_='t1').find('span').get_text()
            print(fuli)
            # Posting date
            time1 = h.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()[5]')[0].replace(' ', '').replace('\r', '').replace('\n', '')
            print(time1)
            # Job description / requirements
            yaoqiu = soup.find('div', class_='bmsg job_msg inbox').get_text().replace(' ', '').replace('\r', '').replace('\n', '')
            print(yaoqiu)
            insert_data(zhiwei=zhiwei, company=company, address=address,
                        xingzhi=xingzhi, xinzi=xinzi, xuewei=xuewei,
                        jingyan=jingyan, guimo=guimo, leixing=leixing,
                        fuli=fuli, time1=time1, yaoqiu=yaoqiu)
        except (IndexError, AttributeError):
            # Pages with a different layout are skipped rather than crashing.
            continue
# Insert one scraped record into the database.
def insert_data(**args):
    with conn.cursor() as cursor:
        cursor.execute(
            "insert into `51job`(zhiwei, company, address, "
            "xingzhi, xinzi, xuewei, jingyan, guimo, "
            "leixing, fuli, time1, yaoqiu) "
            "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (args['zhiwei'], args['company'], args['address'], args['xingzhi'],
             args['xinzi'], args['xuewei'], args['jingyan'], args['guimo'],
             args['leixing'], args['fuli'], args['time1'], args['yaoqiu']))
    conn.commit()
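# insert_data() assumes a `51job` table already exists in the `qianchengwuyou`
# database. The schema is not part of the original source; the helper below is
# a minimal sketch of one layout that satisfies the insert above (every column
# as text, since each field is scraped as a string). Call it once before the
# first run if the table has not been created yet.
def create_table():
    with conn.cursor() as cursor:
        cursor.execute(
            "create table if not exists `51job`("
            "zhiwei varchar(255), company varchar(255), address varchar(255), "
            "xingzhi varchar(255), xinzi varchar(255), xuewei varchar(255), "
            "jingyan varchar(255), guimo varchar(255), leixing varchar(255), "
            "fuli varchar(255), time1 varchar(255), yaoqiu text)")
    conn.commit()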
if __name__ == '__main__':
    conn = connect(host='127.0.0.1', port=3306, user='root', passwd='',
                   db='qianchengwuyou', charset='utf8')
    # Walk the paginated search results for the keyword "python".
    for page in range(33, 753):
        url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'
               '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
               '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=').format(page)
        urls = get_list_url(url)
        get_massage(urls)
    conn.close()