2019-01-06

# Scrapy issues GET requests by default. When the target page expects a POST,
# override start_requests() and build the initial request yourself, sending
# the same form parameters the browser submits.
#
# When running on Windows you may hit an error such as:
#   UnicodeEncodeError: 'gbk' codec can't encode character '\u2764' in
#   position 261: illegal multibyte sequence
# The default encoding of Windows standard output (gbk) cannot represent the
# character. The fix is to change the default encoding of standard output.
import io
import json
import re
import sys

import scrapy
# Selector turns a string into an object that can be queried with XPath.
from scrapy.selector import Selector
# remove_tags strips markup tags from a piece of text.
from w3lib.html import remove_tags

# NOTE(review): ShijijiayuanItem is used below but its import is not shown in
# this snippet; it presumably lives in the project's items module -- confirm
# and import it from there.

# Re-wrap standard output so printing non-gbk characters does not raise.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

_SEARCH_URL = 'http://search.jiayuan.com/v2/search_v2.php'

# The response body carries its JSON payload between two '##jiayser##'
# markers; re.S lets the payload span multiple lines.
_PAYLOAD_PATTERN = re.compile(r'##jiayser##(.*?)##jiayser##', re.S)


class SjjySpider(scrapy.Spider):
    """Spider that POSTs a search form to jiayuan.com and follows the pages.

    Each response is parsed into one ``ShijijiayuanItem`` per entry of the
    payload's ``userInfo`` list, then the next page is requested until the
    page count reported in ``pageTotal`` is reached.
    """

    name = 'sjjy'
    allowed_domains = ['jiayuan.com']
    start_urls = [_SEARCH_URL]

    def start_requests(self):
        """Yield the initial POST request(s) carrying the search form.

        The form data is also stored in ``meta`` so that ``parse`` can read
        the current page number and build the request for the next page.
        """
        form_data = {
            'sex': 'f',
            'key': '',
            'stc': '1: 11, 2: 20.28, 23: 1',
            'sn': 'default',
            'sv': '1',
            'p': '2',
            'f': 'search',
            'listStyle': 'bigPhoto',
            'pri_uid': '0',
            'jsversion': 'v5',
        }
        # formdata: the form fields sent in the POST body.
        for url in self.start_urls:
            yield scrapy.FormRequest(
                url,
                formdata=form_data,
                # Give every request its own copy so later page bumps never
                # alter a dict another request still refers to.
                meta={'form_data': dict(form_data)},
                dont_filter=True,
            )

    def parse(self, response):
        """Extract user items from one result page and request the next page.

        Yields:
            One ``ShijijiayuanItem`` per user in the payload, followed by a
            ``scrapy.FormRequest`` for the next page when one remains.
        """
        print(response.text)

        # Pull out the embedded JSON and turn it into a dict. A page without
        # the markers (error page, captcha, ...) is skipped instead of
        # failing with an IndexError.
        match = _PAYLOAD_PATTERN.search(response.text)
        if match is None:
            self.logger.warning(
                'No ##jiayser## payload found in response from %s',
                response.url,
            )
            return
        data = json.loads(match.group(1))

        for userinfo in data['userInfo']:
            item = ShijijiayuanItem()
            # User id
            item['uid'] = userinfo['uid']
            # Avatar image
            item['header_img'] = userinfo['image']
            # Gender
            item['sex'] = userinfo['sex']
            # Tag text with the markup removed
            item['randTag'] = remove_tags(userinfo['randTag'])
            # Age
            item['age'] = userinfo['age']
            # Height
            item['height'] = userinfo['height']
            # Personal signature
            item['shortnote'] = userinfo['shortnote']
            # Work location
            item['workAddress'] = userinfo['work_location']
            # Requirements for a partner (the key spelling must match the
            # field declared on the item class)
            item['mathCtion'] = userinfo['matchCondition']
            # Nickname
            item['nickname'] = userinfo['nickname']
            print(item)
            yield item

        # Request the next page.
        form_data = response.meta['form_data']
        cur_page = form_data['p']
        next_page = int(cur_page) + 1
        page_total = int(data['pageTotal'])
        # "<=" so that the final page is fetched as well.
        if next_page <= page_total:
            # Build a fresh dict rather than mutating the one kept in meta.
            next_form = dict(form_data, p=str(next_page))
            print(cur_page)
            yield scrapy.FormRequest(
                _SEARCH_URL,
                formdata=next_form,
                meta={'form_data': next_form},
                callback=self.parse,
            )


# Author: 牛耀
# Link: https://www.jianshu.com/p/99ef8fcc4b0a

你可能感兴趣的:(2019-01-06)