Code date: 2018-12-25
This code is quite simple: it uses no proxy pool and sets no crawl delay, so it fetches data fairly quickly, collecting roughly 20,000 (2w) job listings.
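If you did want to throttle it or route requests through a proxy, a minimal sketch looks like this (the PROXIES list, the 1-3 second delay, and the polite_get name are placeholders of mine, not part of the original script):

import random
import time

import requests

PROXIES = []  # e.g. [{'http': 'http://1.2.3.4:8080'}]; empty means a direct connection

def polite_get(url, headers):
    time.sleep(random.uniform(1, 3))  # crawl delay between requests
    proxies = random.choice(PROXIES) if PROXIES else None
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)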
Without further ado, here is my code:
URL manager
# imports used throughout the script
import json
import os
import re
import time

import requests
from lxml import html

# URL manager
class url_magen():
    def __init__(self):
        self.newurl = set()   # URLs waiting to be crawled
        self.oldurl = set()   # URLs already handed out
    # URL set: add a single URL if it is non-empty and not yet visited
    def url_set(self, url):
        if url not in self.oldurl and url != '':
            self.newurl.add(url)
        else:
            print('URL rejected')
    # add a batch of URLs
    def urls_set(self, urls):
        for url in urls:
            self.url_set(url)
    # URL judge: are there URLs left to crawl?
    def url_judge(self):
        return len(self.newurl) > 0
    # URL get: generator that yields each pending URL once,
    # marking it as visited as it goes out
    def url_get(self):
        while self.url_judge():
            url = self.newurl.pop()
            self.oldurl.add(url)
            yield url
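A quick standalone check of the manager (a minimal sketch; the empty string is passed on purpose to show the rejection branch):

magen = url_magen()
magen.urls_set(['https://www.zhaopin.com/', ''])  # the empty string prints 'URL rejected'
for url in magen.url_get():
    print(url)  # each pending URL is yielded once, then marked as visited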
HTML downloader
# HTML downloader
class html_download():
    def __init__(self, url, headers):
        self.url = url
        self.headers = headers
    def download(self):
        # timeout keeps the spider from hanging on a dead connection
        req = requests.get(self.url, headers=self.headers, timeout=10)
        req.encoding = 'utf-8'
        return req.text
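On its own the downloader is used like this (a minimal sketch; headers is the dict defined in the final run section below):

page = html_download('https://www.zhaopin.com/', headers).download()
print(page[:200])  # first 200 characters of the homepage HTML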
HTML parser
# HTML parser
class html_analysis():
    def __init__(self, text_t):
        self.text_t = text_t
        self.zl_dict = {}   # {job category: [keyword list]}
        self.urls = set()   # search-API URLs built from the keywords
    # pull the job-category navigation off the homepage and build API URLs
    def url_analysis(self):
        html_t = html.etree.HTML(self.text_t)
        Type_A = html_t.xpath('//div[@class="zp-jobNavigater__pop--title"]/text()')
        for i in Type_A:
            Type_B = html_t.xpath('//div[text()="%s"]/following-sibling::div/a/text()' % i)
            self.zl_dict[i] = Type_B
            for j in Type_B:
                self.urls.add('https://fe-api.zhaopin.com/c/i/sou?pageSize=100&cityId=489&kw=%s&kt=1' % j)
        return self.urls, self.zl_dict
    # the search API answers with JSON, not HTML
    def data_analysis(self):
        json_t = json.loads(self.text_t)
        return json_t
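The first stage can be exercised without the real site (a minimal sketch; the fragment is a hand-made stand-in for the navigation markup the XPath expressions expect, and 'IT', 'Java', 'Python' are invented sample values):

fragment = ('<div class="zp-jobNavigater__pop--title">IT</div>'
            '<div><a>Java</a><a>Python</a></div>')
urls, zl_dict = html_analysis(fragment).url_analysis()
print(zl_dict)  # {'IT': ['Java', 'Python']}
print(urls)     # two fe-api.zhaopin.com search URLs, one per keyword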
Data saver
# data saver
class data_save():
    def __init__(self, data, save_path, url_s):
        self.data = data
        self.save_path = save_path
        self.url_s = url_s
    def dsave(self):
        gz_infos = self.data['data']['results']
        # the keyword this URL was built from decides which leaf folder to use
        kw = re.search(r'kw=([^&]+)', self.url_s).group(1)
        for now_path, dirs, files in os.walk(self.save_path):
            if os.path.basename(now_path) != kw:
                continue
            n = 0
            for gz_info in gz_infos:
                with open('%s/%s.txt' % (now_path, n), 'a', encoding='utf-8') as f:
                    f.write('url:%s\n' % self.url_s)
                    f.write('time:%s\n' % time.strftime('%Y%m%d'))
                    f.write('#syc\n')
                    f.write(str(gz_info))
                n = n + 1
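The saver can be tried with a stub that mimics the {'data': {'results': [...]}} shape dsave indexes into (a minimal sketch; the folder data/IT/Python must already exist, and the job fields are invented):

stub = {'data': {'results': [{'jobName': 'Python developer', 'salary': '10K-15K'}]}}
api_url = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=100&cityId=489&kw=Python&kt=1'
data_save(stub, 'data', api_url).dsave()  # writes data/IT/Python/0.txt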
Spider scheduler
# spider scheduler
class spider_run():
    def __init__(self, url, headers, data_path):
        self.url = url
        self.headers = headers
        self.urlmagen = url_magen()
        # the classes below are stored unbound and instantiated per use: xxx().xxx
        self.htmldownload = html_download
        self.htmlanalysis = html_analysis
        self.datasave = data_save
        self.data_path = data_path
    def mkdir(self, path_dict):
        '''
        :syc
        :param path_dict: Zhaopin data dict {category: [keywords]}
        :creates the Zhaopin data folder tree
        '''
        for i in path_dict.keys():
            # a '/' inside a category name would be read as a path separator
            if r'/' in i:
                i_s = re.sub(r'/', r'-', i)
            else:
                i_s = i
            if not os.path.exists('%s/%s' % (self.data_path, i_s)):
                for j in path_dict[i]:
                    if not os.path.exists('%s/%s/%s' % (self.data_path, i_s, j)):
                        os.makedirs('%s/%s/%s' % (self.data_path, i_s, j))
                    else:
                        print('%s/%s already exists' % (i_s, j))
            else:
                print('%s already exists' % i)
    def run(self):
        # download the homepage and extract keywords plus search-API URLs
        html_t = self.htmldownload(self.url, self.headers).download()
        urls, zl_dict = self.htmlanalysis(html_t).url_analysis()
        # create the Zhaopin data folder tree
        self.mkdir(zl_dict)
        # crawl every API URL and save its results
        self.urlmagen.urls_set(urls)
        for url in self.urlmagen.url_get():
            json_t = self.htmldownload(url, self.headers).download()
            data = self.htmlanalysis(json_t).data_analysis()
            self.datasave(data, self.data_path, url).dsave()
Final run
if __name__ == '__main__':
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",
    }
    url = 'https://www.zhaopin.com/'
    data_path = '/home/syc/Python/python_pc/data/zhilian_data'
    spiderrun = spider_run(url, headers, data_path)
    spiderrun.run()