# Scraper notes:
# 1. The site's cookie changes frequently (the hard-coded Cookie header below must be refreshed).
# 2. Request rate must be limited, or the site blocks the scraper.
# 3. The real request URL is generated inside the site's JavaScript.
#coding:utf-8
import json
import time
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
import requests
#__author__='小菜菜1223'
def run(h, page):
    """Fetch one zhipin.com search-result page and pass the parsed DOM to main().

    h    -- open file handle that main() appends scraped rows to
    page -- 1-based result-page number, substituted into the query string

    The request carries a hard-coded session Cookie; per the notes at the top
    of the file it expires frequently and must be refreshed by hand.
    """
    url = ("https://www.zhipin.com/c101010100/b_%E9%BE%99%E5%B2%97%E5%8C%BA/?query="
           "%E6%B7%B1%E5%9C%B3+%E5%A4%96%E8%B4%B8&page=" + str(page) + "&ka=page-" + str(page))
    headers = {
        'referer': 'https://www.zhipin.com/job_detail/?query=%E6%B7%B1%E5%9C%B3+%E5%A4%96%E8%B4%B8&city=101010100&industry=&position=',
        # NOTE(review): session-bound cookie copied from a browser; it goes stale
        # quickly and the scrape silently returns an empty/blocked page when it does.
        'Cookie': 'lastCity=101010100; _uab_collina=155894826851450523301609; sid=sem_pz_bdpc_dasou_title; __c=1588755259; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2Fbeijing%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDdiHC088qh0KZEgs77XeFX000007hOm-C00000LIIMxh.THdBULP1doZA8QMu1x60UWdBmy-bIfK15yRknWT3nHfknj0sn1TduhR0IHdjPW64rDmznDRLnH0znH9jwWPAPHm3fH6drHbdwbRvr0K95gTqFhdWpyfqn1nYn1mLnjDzniusThqbpyfqnHm0uHdCIZwsT1CEQLILIz4lpA-spy38mvqVQ1q1pyfqTvNVgLKlgvFbTAPxuA71ULNxIA-YUAR0mLFW5Hn4nH0s%26tpl%3Dtpl_11534_21264_17382%26l%3D1516420953%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBOSS%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3343670121_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D140%26ie%3Dutf-8%26f%3D3%26tn%3Dbaidu%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598%25E5%25AE%2598%25E7%25BD%2591%26oq%3Dlagou%26rqlang%3Dcn%26inputT%3D2119%26prefixsug%3Dboss%26rsp%3D0&g=%2Fwww.zhipin.com%2Fbeijing%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&friend_source=0&friend_source=0; __zp_seo_uuid__=9b582547-62f6-40da-bc9c-07ee84e05e24; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1588755259,1588755596,1588757599,1588759051; __a=92101704.1554684563.1583218762.1588755259.75.5.33.33; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1588816418; '
                  '__zp_stoken__=3ac4%2F0P0uAy6VOL%2B25qF9xRXyhApD5Y8y23Z22kWm%2BWfFbijkLuFTP1FbrKhGhfzsxPzJ%2F%2FU0PNRY7EudEghGAdUfnYKrc7yHrloIsimIbb5uPgrVOtCUE%2FrkVUHwXwpD1hn',
        'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML,like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    main(h, soup)
def main(h, soup):
    """Extract job location / job title pairs from one parsed result page and
    append them to *h* as 'location~title' lines (UTF-8 encoded).

    h    -- open file handle; the caller is responsible for flushing/closing it
    soup -- BeautifulSoup document of a zhipin.com search-result page
    """
    # Job locations: first child of every element with class "job-area".
    acc = []
    for area in soup.find_all(attrs={'class': 'job-area'}):
        acc.append(''.join(area.contents[0]))

    # Job titles: the title attribute of the <a> inside each "name" element.
    # "name" is also used for non-job elements (e.g. recruiter names), which
    # lack an <a title=...>; those raise and are skipped.
    pos = []
    for name_tag in soup.find_all(attrs={'class': 'name'}):
        try:
            pos.append(name_tag.a['title'])
        except (TypeError, KeyError, AttributeError):
            continue

    # Debug aid: the two counts can legitimately differ (see skip above).
    print('%d %d' % (len(acc), len(pos)))

    # zip() stops at the shorter list. The original code indexed pos[i] over
    # range(len(acc)) and raised IndexError whenever fewer titles than
    # locations were scraped.
    for location, title in zip(acc, pos):
        h.write(location.encode('utf-8') + '~' + title.encode('utf-8') + '\n')

    # Crude rate limit so the site does not block the scraper (note 2 at top of file).
    time.sleep(5)
if __name__ == '__main__':
    # Scrape result pages 1-9 into res.txt. The with-block guarantees the
    # file is closed (the original leaked the handle); flushing after every
    # page preserves the pages fetched so far if a later request fails.
    with open('res.txt', 'w') as h:
        for page in range(1, 10):
            print(page)
            run(h, page)
            h.flush()