一个1000元的爬虫外包项目,三种爬虫模式给你轻松做出来

前言
本文的文字及图片来源于网络,仅供学习、交流使用,不具有任何商业用途,如有问题请及时联系我们以作处理。

对于这个外包给出的几个网站,我们就选择第一个<猎聘网>来爬取,只做简单实现,不进行打包处理,也不支持输入关键字爬取。

本篇文章就使用三种爬虫模式爬取相关数据

1、常规爬取数据

2、多线程爬取数据

3、scrapy框架爬取数据

基本开发环境
Python 3.6
Pycharm
相关模块的使用
常规爬取数据

import requests
import parsel
import csv

多线程爬取数据

import requests
import parsel
import csv
import threading

scrapy框架爬取数据

import scrapy
import csv

目标网页分析
爬取python招聘信息数据

数据获取:

1、标题

2、薪资

3、所在城市

4、学历要求

5、工作经验要求

6、公司名字

7、公司福利

8、公司融资情况

9、简历发布时间

该网页是静态网页,结构比较简单,没有什么需要过多分析的地方。

1、模拟浏览器请求网页,获取网页源代码数据

2、解析网页源代码,提取想要的数据内容

3、将提取的数据内容保存成csv文件,或者其他形式

都说比较简单了,那为什么这个外包还价值1000呢?难道外包赚钱真的这么简单么。是不难,但是不意味着1K的外包就很好赚,毕竟别人只是简单的给出几个网站,首先看你是否能爬取其中的数据,其次甲方的要求肯定不至于此。数据量也不简单。所以今天就以三个版本的爬虫爬取数据。

外包的价格高低因素:

任务的难易程度
爬取的数据量
是否紧急需要
是否需要源码
后期是否需要更新代码

常规爬虫代码

复制代码
# Regular (single-threaded) crawler: fetch the first 10 result pages of
# python job listings on liepin.com and append each record to data.csv.
import requests
import parsel
import csv

# newline='' prevents the csv module from writing blank lines on Windows.
f = open('data.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=['标题', '薪资', '城市',
                                           '学历', '工作经验', '公司名字',
                                           '融资情况', '公司福利', '招聘时间',
                                           '简历反馈时间'])
csv_writer.writeheader()

for page in range(0, 10):
    url = 'https://www.liepin.com/zhaopin/'
    # Query string copied from a real browser search; 'curPage' drives
    # pagination, 'key' is the search keyword.
    params = {
        'compkind': '',
        'dqs': '',
        'pubTime': '',
        'pageSize': '40',
        'salary': '',
        'compTag': '',
        'sortFlag': '',
        'degradeFlag': '0',
        'compIds': '',
        'subIndustry': '',
        'jobKind': '',
        'industries': '',
        'compscale': '',
        'key': 'python',
        'siTag': 'I-7rQ0e90mv8a37po7dV3Q~fA9rXquZc5IkJpXC-Ycixw',
        'd_sfrom': 'search_fp',
        'd_ckId': 'cd74f9fdbdb63c6d462bad39feddc7f1',
        'd_curPage': '2',
        'd_pageSize': '40',
        'd_headId': 'cd74f9fdbdb63c6d462bad39feddc7f1',
        'curPage': page,
    }
    # Desktop User-Agent so the site serves the normal HTML page.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    response = requests.get(url=url, params=params, headers=headers)
    selector = parsel.Selector(response.text)
    # Each <li> under the first job-content div is one job listing.
    lis = selector.css('div.job-content div:nth-child(1) ul li')
    for li in lis:
        title = li.css('.job-info h3 a::text').get().strip()
        money = li.css('.condition span.text-warning::text').get()
        city = li.css('.condition .area::text').get()
        edu = li.css('.condition .edu::text').get()
        experience = li.css('.condition span:nth-child(4)::text').get()
        company = li.css('.company-name a::text').get()
        financing = li.css('.field-financing span::text').get()
        # A listing can have several benefit tags; join them with '|'.
        temptation_list = li.css('p.temptation.clearfix span::text').getall()
        temptation_str = '|'.join(temptation_list)
        release_time = li.css('p.time-info.clearfix time::text').get()
        feedback_time = li.css('p.time-info.clearfix span::text').get()
        dit = {
            '标题': title,
            '薪资': money,
            '城市': city,
            '学历': edu,
            '工作经验': experience,
            '公司名字': company,
            '融资情况': financing,
            '公司福利': temptation_str,
            '招聘时间': release_time,
            '简历反馈时间': feedback_time,
        }
        csv_writer.writerow(dit)
        print(dit)

# Close the output file so buffered rows are flushed to disk
# (the original left the handle open).
f.close()

多线程爬虫

import requests
import parsel
import csv
import threading

# Shared CSV output for all worker threads; mode 'a' appends across runs
# and newline='' prevents blank lines on Windows.
f = open('data_1.csv', mode='a', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=['标题', '薪资', '城市',
                                           '学历', '工作经验', '公司名字',
                                           '融资情况', '公司福利', '招聘时间',
                                           '简历反馈时间'])
csv_writer.writeheader()

def get_response(html_url, p):
    """Fetch html_url with query params p and return the requests.Response.

    Sends a desktop User-Agent so the site serves the normal HTML page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    return requests.get(url=html_url, params=p, headers=headers)

def get_parsing(html_data):
    """Wrap raw HTML text in a parsel.Selector for CSS querying."""
    return parsel.Selector(html_data)

def main(p):
    """Thread worker: fetch one result page (query params p), parse every
    job listing on it, and append each record to the shared CSV writer.
    """
    url = 'https://www.liepin.com/zhaopin/'
    html_data = get_response(url, p).text
    selector = get_parsing(html_data)
    # Each <li> under the first job-content div is one job listing.
    lis = selector.css('div.job-content div:nth-child(1) ul li')
    for li in lis:
        title = li.css('.job-info h3 a::text').get().strip()
        money = li.css('.condition span.text-warning::text').get()
        city = li.css('.condition .area::text').get()
        edu = li.css('.condition .edu::text').get()
        experience = li.css('.condition span:nth-child(4)::text').get()
        company = li.css('.company-name a::text').get()
        financing = li.css('.field-financing span::text').get()
        # A listing can have several benefit tags; join them with '|'.
        temptation_list = li.css('p.temptation.clearfix span::text').getall()
        temptation_str = '|'.join(temptation_list)
        release_time = li.css('p.time-info.clearfix time::text').get()
        feedback_time = li.css('p.time-info.clearfix span::text').get()
        dit = {
            '标题': title,
            '薪资': money,
            '城市': city,
            '学历': edu,
            '工作经验': experience,
            '公司名字': company,
            '融资情况': financing,
            '公司福利': temptation_str,
            '招聘时间': release_time,
            '简历反馈时间': feedback_time,
        }
        # NOTE(review): csv writer objects are not documented as
        # thread-safe; concurrent writerow calls may interleave rows —
        # consider guarding this with a threading.Lock.
        csv_writer.writerow(dit)
        print(dit)

if __name__ == '__main__':
    # Launch one worker thread per result page (pages 0-9); all threads
    # share the module-level csv_writer.
    for page in range(0, 10):
        params = {
            'compkind': '',
            'dqs': '',
            'pubTime': '',
            'pageSize': '40',
            'salary': '',
            'compTag': '',
            'sortFlag': '',
            'degradeFlag': '0',
            'compIds': '',
            'subIndustry': '',
            'jobKind': '',
            'industries': '',
            'compscale': '',
            'key': 'python',
            'siTag': 'I-7rQ0e90mv8a37po7dV3Q~fA9rXquZc5IkJpXC-Ycixw',
            'd_sfrom': 'search_fp',
            'd_ckId': 'cd74f9fdbdb63c6d462bad39feddc7f1',
            'd_curPage': '2',
            'd_pageSize': '40',
            'd_headId': 'cd74f9fdbdb63c6d462bad39feddc7f1',
            'curPage': page,
        }
        main_thread = threading.Thread(target=main, args=(params,))
        main_thread.start()

scrapy爬虫框架

items.py

import scrapy

class LiepingwangItem(scrapy.Item):
    """Container for one job listing scraped from liepin.com."""
    title = scrapy.Field()           # job title
    money = scrapy.Field()           # salary
    city = scrapy.Field()            # city of the position
    edu = scrapy.Field()             # education requirement
    experience = scrapy.Field()      # work-experience requirement
    company = scrapy.Field()         # company name
    financing = scrapy.Field()       # company financing stage
    temptation_str = scrapy.Field()  # '|'-joined benefit tags
    release_time = scrapy.Field()    # posting time
    feedback_time = scrapy.Field()   # resume feedback time

middlewares.py

class LiepingwangDownloaderMiddleware:
    """Downloader middleware that adds a desktop User-Agent to every request."""

    def process_request(self, request, spider):
        request.headers.update(
            {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
            }
        )
        # Returning None tells Scrapy to continue processing this request
        # (the original had a typo: `return Noe`).
        return None

pipelines.py

import csv

class LiepingwangPipeline:
    """Item pipeline: strip stray whitespace and append rows to data_2.csv."""

    def __init__(self):
        # newline='' prevents blank lines on Windows; the file stays open
        # for the whole crawl and is closed in close_spider().
        self.file = open('data_2.csv', mode='a', encoding='utf-8', newline='')
        self.csv_file = csv.DictWriter(self.file, fieldnames=['title', 'money', 'city', 'edu',
                                                              'experience', 'company', 'financing', 'temptation_str',
                                                              'release_time', 'feedback_time'])
        self.csv_file.writeheader()

    def process_item(self, item, spider):
        dit = dict(item)
        # These two fields come wrapped in layout whitespace on the page.
        dit['financing'] = dit['financing'].strip()
        dit['title'] = dit['title'].strip()
        self.csv_file.writerow(dit)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() automatically when the spider ends.
        # The original method name spider_closed() is only invoked via an
        # explicit signal connection, so the file handle was never closed.
        self.file.close()

settings.py

# settings.py — minimal overrides for this project.
ROBOTSTXT_OBEY = False  # the listing pages are disallowed by robots.txt; ignore it
DOWNLOADER_MIDDLEWARES = {
    'liepingwang.middlewares.LiepingwangDownloaderMiddleware': 543,
}
# The original snippet was missing the closing brace of this dict.
ITEM_PIPELINES = {
    'liepingwang.pipelines.LiepingwangPipeline': 300,
}

爬虫文件

import scrapy

# The original had a mangled relative import: `from …items` (ellipsis char).
from ..items import LiepingwangItem


class ZpinfoSpider(scrapy.Spider):
    """Crawl python job listings on liepin.com, following pagination."""
    name = 'zpinfo'
    allowed_domains = ['liepin.com']
    start_urls = ['https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=python']

    def parse(self, response):
        # Each <li> under the first job-content div is one job listing.
        lis = response.css('div.job-content div:nth-child(1) ul li')
        for li in lis:
            title = li.css('.job-info h3 a::text').get().strip()
            money = li.css('.condition span.text-warning::text').get()
            city = li.css('.condition .area::text').get()
            edu = li.css('.condition .edu::text').get()
            experience = li.css('.condition span:nth-child(4)::text').get()
            company = li.css('.company-name a::text').get()
            financing = li.css('.field-financing span::text').get()
            # A listing can have several benefit tags; join them with '|'.
            temptation_list = li.css('p.temptation.clearfix span::text').getall()
            temptation_str = '|'.join(temptation_list)
            release_time = li.css('p.time-info.clearfix time::text').get()
            feedback_time = li.css('p.time-info.clearfix span::text').get()
            yield LiepingwangItem(title=title, money=money, city=city, edu=edu, experience=experience, company=company,
                                  financing=financing, temptation_str=temptation_str, release_time=release_time,
                                  feedback_time=feedback_time)
        # 'a:nth-child(9)' is the "next page" link; recurse until it is absent.
        href = response.css('div.job-content div:nth-child(1) a:nth-child(9)::attr(href)').get()
        if href:
            next_url = 'https://www.liepin.com' + href
            yield scrapy.Request(url=next_url, callback=self.parse)

你可能感兴趣的:(python,程序员,爬虫,外包项目)