Spider Wars: Scraping Zhaopin Job Data

Web scraping: fetching job data from Zhaopin

Code written: 2018-12-25

Language: Python

Libraries: requests, lxml    Platform: Linux

The code is deliberately simple: no proxy pool and no crawl delay, so it fetches quickly, pulling in roughly 20,000 job postings.
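
For reference, if the site ever starts blocking you, requests can route traffic through a proxy via its proxies argument; a minimal sketch (the proxy address below is a placeholder, not a working one):

import requests

# placeholder address: substitute a live proxy, or rotate through a pool
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}
req = requests.get('https://www.zhaopin.com/', proxies=proxies)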

Without further ado, here is the code:

URL manager

# imports shared by all the classes below
import json
import os
import re
import time
from urllib.parse import urlparse, parse_qs

import requests
from lxml import etree

# URL manager: tracks pending URLs (newurl) and visited URLs (oldurl)
class url_magen():
    def __init__(self):
        self.newurl = set()
        self.oldurl = set()

    # add a single URL, skipping empty strings and already-visited URLs
    def url_set(self, url):
        if url not in self.oldurl and url != '':
            self.newurl.add(url)
        else:
            print('invalid or duplicate URL')

    def urls_set(self, urls):
        for url in urls:
            self.url_set(url)

    # report whether any pending URLs remain
    def url_judge(self):
        return len(self.newurl) > 0

    # generator: keep yielding pending URLs until the set is exhausted;
    # each yielded URL is moved to oldurl so it is never fetched twice
    def url_get(self):
        while self.url_judge():
            url = self.newurl.pop()
            self.oldurl.add(url)
            yield url
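
A quick sanity check of how the manager behaves (a standalone snippet, not part of the original script):

magen = url_magen()
magen.urls_set(['https://www.zhaopin.com/', ''])  # the empty string is rejected
for u in magen.url_get():
    print(u)                  # yields the single accepted URL
print(magen.url_judge())      # False: the pending set is now exhausted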

HTML downloader

# HTML downloader: fetches a URL and returns the decoded body
class html_download():
    def __init__(self, url, headers):
        self.url = url
        self.headers = headers

    def download(self):
        req = requests.get(self.url, headers=self.headers)
        req.encoding = 'utf-8'
        return req.text
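
The downloader above has no timeout or error handling, so one stalled connection can hang the whole crawl. A hedged variant (the retry count and timeout are illustrative choices, not from the original):

# sketch: downloader with a timeout and simple retries
def download_safe(url, headers, retries=3, timeout=10):
    for attempt in range(retries):
        try:
            req = requests.get(url, headers=headers, timeout=timeout)
            req.raise_for_status()  # turn HTTP error codes into exceptions
            req.encoding = 'utf-8'
            return req.text
        except requests.RequestException as e:
            print('attempt %d failed: %s' % (attempt + 1, e))
    return ''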

HTML parser

# HTML parser: extracts job-category keywords from the homepage and
# builds the corresponding search-API URLs; also decodes JSON responses
class html_analysis():
    def __init__(self, text_t):
        self.text_t = text_t
        self.zl_dict = {}
        self.urls = set()

    def url_analysis(self):
        html_t = etree.HTML(self.text_t)
        # top-level category titles in the homepage job navigator
        Type_A = html_t.xpath('//div[@class="zp-jobNavigater__pop--title"]/text()')
        for i in Type_A:
            # sub-category links that follow each category title
            Type_B = html_t.xpath('//div[text()="%s"]/following-sibling::div/a/text()' % i)
            self.zl_dict[i] = Type_B
            for j in Type_B:
                # one search-API URL per sub-category keyword
                self.urls.add('https://fe-api.zhaopin.com/c/i/sou?pageSize=100&cityId=489&kw=%s&kt=1' % j)
        return self.urls, self.zl_dict

    def data_analysis(self):
        json_t = json.loads(self.text_t)
        return json_t
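
The saver below reads the job list from data['data']['results'] in the decoded JSON. To peek at the payload, a small snippet (field names such as jobName and company are guesses at this API's schema, not confirmed by the post):

data = html_analysis(json_t).data_analysis()
for job in data['data']['results']:  # the path data_save relies on
    # 'jobName' and 'company' are assumed field names, shown for illustration
    print(job.get('jobName'), job.get('company', {}).get('name'))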

Data saver

# data saver: writes each job posting of one API response into the
# sub-directory matching the search keyword
class data_save():
    def __init__(self, data, save_path, url_s):
        self.data = data
        self.save_path = save_path
        self.url_s = url_s

    def dsave(self):
        gz_infos = self.data['data']['results']
        # recover the search keyword from the kw parameter of the API URL,
        # applying the same '/' -> '-' substitution used when the
        # directories were created
        kw = parse_qs(urlparse(self.url_s).query)['kw'][0]
        kw = re.sub(r'/', r'-', kw)
        for now_path, dirs, files in os.walk(self.save_path):
            # write only into the directory named after this keyword
            if os.path.basename(now_path) == kw:
                for n, gz_info in enumerate(gz_infos):
                    with open('%s/%s.txt' % (now_path, n), 'a') as f:
                        f.write('url:%s\n' % self.url_s)
                        f.write('time:%s\n' % time.strftime('%Y%m%d'))
                        f.write('#syc')
                        f.write(str(gz_info))
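
Note that str(gz_info) stores a Python dict repr, which is awkward to re-parse later. A minimal alternative sketch, writing each result as a JSON line instead (a suggestion, not the original behaviour):

# drop-in replacement for f.write(str(gz_info)): one JSON object per
# line, loadable later with json.loads
f.write(json.dumps(gz_info, ensure_ascii=False) + '\n')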

Spider scheduler

# spider scheduler: wires the manager, downloader, parser and saver together
class spider_run():
    def __init__(self, url, headers, data_path):
        self.url = url
        self.headers = headers
        self.urlmagen = url_magen()
        # class references, instantiated per call as xxx(...).method()
        self.htmldownload = html_download
        self.htmlanalysis = html_analysis
        self.datasave = data_save
        self.data_path = data_path

    def mkdir(self, path_dict):
        '''
        :syc
        :param path_dict: dict of Zhaopin categories -> sub-category keywords
        :creates the Zhaopin data directory tree
        '''
        for i in path_dict.keys():
            # a '/' in a name would be taken as a path separator
            i_s = re.sub(r'/', r'-', i)
            for j in path_dict[i]:
                j_s = re.sub(r'/', r'-', j)
                if not os.path.exists('%s/%s/%s' % (self.data_path, i_s, j_s)):
                    os.makedirs('%s/%s/%s' % (self.data_path, i_s, j_s))
                else:
                    print('%s/%s already exists' % (i_s, j_s))

    def run(self):
        html_t = self.htmldownload(self.url, self.headers).download()
        urls, zl_dict = self.htmlanalysis(html_t).url_analysis()
        # create the Zhaopin data directory tree
        self.mkdir(zl_dict)

        self.urlmagen.urls_set(urls)
        for url in self.urlmagen.url_get():
            json_t = self.htmldownload(url, self.headers).download()
            data = self.htmlanalysis(json_t).data_analysis()
            self.datasave(data, self.data_path, url).dsave()
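
As noted at the top, no crawl delay is set. If the site starts rejecting requests, a small randomized pause could be dropped into run()'s fetch loop, right before each download; a sketch (the delay bounds are arbitrary):

import random
import time

# sleep 1-3 seconds between requests (illustrative values; the original
# deliberately omits any delay for speed)
time.sleep(random.uniform(1, 3))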

Running it

if __name__ == '__main__':
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",
    }
    url = 'https://www.zhaopin.com/'
    data_path = '/home/syc/Python/python_pc/data/zhilian_data'
    spiderrun = spider_run(url, headers, data_path)
    spiderrun.run()
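
One caveat: each API URL asks for pageSize=100 with no offset, so at most 100 postings come back per keyword. If the endpoint accepts an offset parameter (commonly start on this API, though that is an assumption here, not something the post confirms), pagination could be bolted on like this:

# hypothetical pagination sketch: 'start' as an offset parameter is an
# assumption about the fe-api endpoint
keyword = 'Python'  # any sub-category keyword taken from zl_dict
base = 'https://fe-api.zhaopin.com/c/i/sou?pageSize=100&cityId=489&kw=%s&kt=1'
for start in range(0, 500, 100):
    page_url = base % keyword + '&start=%d' % start
    # fetch page_url as in run(), then stop once results come back empty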
