Topic
Tencent social recruitment site: https://hr.tencent.com/position.php?&start=#a0
Fields to scrape:
Position name: positionName
Position link: positionLink
Position type: positionType
Number of openings: peopleNumber
Work location: workLocation
Publish time: publishTime
items.py

import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # position name
    positionName = scrapy.Field()
    # position link
    positionLink = scrapy.Field()
    # position type
    positionType = scrapy.Field()
    # number of openings
    peopleNumber = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish time
    publishTime = scrapy.Field()
tencent.py

import scrapy

from Tencent.items import TencentItem


class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    # start_urls = ['http://tencent.com/']
    baseURL = 'https://hr.tencent.com/position.php?&start='
    # pagination offset
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            # extract each position's fields (under Python 2, append
            # .encode('utf-8') to get UTF-8 byte strings)
            item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0]
            item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0]
            # the position-type cell can be empty
            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath("./td[2]/text()").extract()[0]
            else:
                item['positionType'] = ''
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
            yield item

        # strategy 1: crawl by incrementing the offset
        # if self.offset < 3900:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)

        # strategy 2: follow the "next page" link until it is disabled
        if len(response.xpath("//a[@class='noactive' and @id='next']")) == 0:
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request('https://hr.tencent.com/' + url, callback=self.parse)
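One refinement worth noting: extract()[0] raises an IndexError whenever an XPath matches nothing, which is why the td[2] column needs the explicit length check. Scrapy's selectors also provide extract_first(), which returns a default instead, so that check could be collapsed to a single line (a sketch with the same behavior as above):

# extract_first() returns the default when the cell is empty
item['positionType'] = node.xpath("./td[2]/text()").extract_first(default='')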
pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPipeline(object):
    def __init__(self):
        # one shared output file for the whole crawl
        self.f = open('tencent.json', 'w')

    def process_item(self, item, spider):
        # content = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        content = str(dict(item)) + ',\n'
        self.f.write(content)
        return item

    def close_spider(self, spider):
        self.f.close()
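As the commented-out line hints, str(dict(item)) writes Python repr syntax rather than valid JSON. A minimal sketch of the json.dumps variant (hypothetical class name; one JSON object per line, with ensure_ascii=False keeping the Chinese text readable):

import json


class TencentJsonPipeline(object):
    # alternative pipeline writing JSON Lines instead of repr strings
    def __init__(self):
        self.f = open('tencent.json', 'w')

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()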
settings.py

ROBOTSTXT_OBEY = False

Enable the pipeline:
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
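With robots.txt checks disabled and the pipeline registered, the crawl can be started from the project root with scrapy crawl tencent; the pipeline appends every scraped item to tencent.json.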
Downloading streamer images from Douyu's 颜值 (looks) category
items.py
import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # streamer nickname
    nickname = scrapy.Field()
    # image URL
    imagelink = scrapy.Field()
douyu.py
import json

import scrapy

from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    # the API lives on capi.douyucdn.cn, so allow that domain
    allowed_domains = ['douyucdn.cn']
    # http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0
    baseURL = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        data_list = json.loads(response.body)['data']
        # an empty 'data' array means we have paged past the last result
        if len(data_list) == 0:
            return
        for data in data_list:
            item = DouyuItem()
            item['nickname'] = data['nickname']
            item['imagelink'] = data['vertical_src']
            yield item

        # page through the API by bumping the offset; dont_filter=True stops
        # the scheduler from discarding these deliberately repetitive URLs
        self.offset += 20
        yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse, dont_filter=True)
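For reference, a standalone sketch (outside Scrapy) to inspect the shape of the API response; it assumes the endpoint is still live and reuses the okhttp User-Agent configured in settings.py below:

import json
import urllib.request

# fetch one page and print the two fields the spider reads
url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0'
req = urllib.request.Request(url, headers={'User-Agent': 'okhttp/3.8.1'})
with urllib.request.urlopen(req) as resp:
    data = json.loads(resp.read())['data']
for room in data[:3]:
    print(room['nickname'], room['vertical_src'])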
pipelines.py
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Douyu.settings import IMAGES_STORE as images_store


class DouyuPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # request each item's image; ImagesPipeline downloads and stores it
        yield scrapy.Request(item['imagelink'])

    # results arrives as a list of (success, info_dict) tuples, e.g.:
    # [(True,
    #   {'url': 'https://rpic.douyucdn.cn/amrpic-180515/1977639_1126.jpg',
    #    'path': 'full/ea1e27dfee48e49c029b5bd5aa1a1fd6652ce9cd.jpg',
    #    'checksum': '91a00aa61465331c42d55c5d96260427'})]
    def item_completed(self, results, item, info):
        # collect the stored paths of the successfully downloaded images
        image_path = [x['path'] for ok, x in results if ok]
        # rename the hash-named file to <nickname>.jpg
        os.rename(images_store + image_path[0], images_store + item['nickname'] + '.jpg')
        return item
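An alternative sketch that avoids the rename step entirely: Scrapy 2.4+ passes the item into ImagesPipeline.file_path(), so the file can be named after the streamer up front (hypothetical class name, not part of the original project):

import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DouyuNamedPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['imagelink'])

    # Scrapy 2.4+ signature: file_path() receives the item being processed
    def file_path(self, request, response=None, info=None, *, item=None):
        # save directly as <nickname>.jpg under IMAGES_STORE
        return item['nickname'] + '.jpg'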
settings.py

Add the image storage path:
IMAGES_STORE = "/Users/zingfront/Documents/newdir/learn_scrapy/Douyu/Images/"

Add a User-Agent:
USER_AGENT = 'okhttp/3.8.1'

Enable the pipeline:
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'Douyu.pipelines.DouyuPipeline': 300,
}
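As with the Tencent project, the crawl is started with scrapy crawl douyu; downloaded images land under IMAGES_STORE and are renamed to <nickname>.jpg by the pipeline.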