1. Simulating a GitHub login

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy import FormRequest


class Login1Spider(scrapy.Spider):
    name = 'login1'
    allowed_domains = ['github.com']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://github.com/',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    start_urls = ['https://github.com/758109577']

    def start_requests(self):
        urls = ['https://github.com/login']
        for url in urls:
            yield Request(url, meta={'cookiejar': 1}, callback=self.github_login)

    def github_login(self, response):
        # First obtain authenticity_token; you can use scrapy shell "url" to fetch the page
        # and read the value of authenticity_token from the page source.
        authenticity_token = response.xpath("//input[@name='authenticity_token']/@value").extract_first()
        self.logger.info('authenticity_token=' + authenticity_token)
        # The URL can be captured with Fiddler. With dont_click=True the form data is
        # submitted without clicking any element on the form.
        return FormRequest.from_response(response,
                                         url='https://github.com/session',
                                         meta={'cookiejar': response.meta['cookiejar']},
                                         headers=self.headers,
                                         formdata={'utf8': '✓',
                                                   'authenticity_token': authenticity_token,
                                                   'login': '[email protected]',
                                                   'password': 'xxxxxx'},
                                         callback=self.github_after,
                                         dont_click=True,
                                         )

    def github_after(self, response):
        # Look for the string 'Browse activity' on the post-login home page
        list = response.xpath("//a[@class='UnderlineNav-item selected']/text()").extract()
        # If the string is present, log that the login succeeded
        if 'Browse activity' in list:
            self.logger.info('Login succeeded; the keyword I extracted is: Browse activity')
        for url in self.start_urls:
            yield Request(url=url, callback=self.show)

    def show(self, response):
        print("############################")
        list = response.xpath("//span[@class='p-nickname vcard-username d-block']/text()").extract()
        if 'aaaaaa' in list:
            print(list)
            print("############################")
        else:
            print("Failed")
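The comment in github_login() mentions using scrapy shell to locate authenticity_token. As a quick sanity check before running the spider, you can open the login page in the shell and evaluate the same XPath; the session below is only illustrative:

# Run from a terminal; fetches the login page into an interactive shell
scrapy shell "https://github.com/login"

# Inside the shell, the same XPath used in github_login() returns the hidden token value
>>> response.xpath("//input[@name='authenticity_token']/@value").extract_first()

Note that the 'cookiejar' meta key relies on Scrapy's cookies middleware, which is enabled by default (COOKIES_ENABLED = True).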
2. Simulating a 51CTO login
Configure the item fields:

vim items.py

import scrapy


class CtoItem(scrapy.Item):
    title_url = scrapy.Field()
    title = scrapy.Field()
    fullname = scrapy.Field()

Then write the spider:

vim login2.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest, Request
from ..items import CtoItem


class Login2Spider(scrapy.Spider):
    name = 'login2'
    allowed_domains = ['51cto.com']
    # start_urls = ['http://51cto.com/']

    def start_requests(self):
        urls = ['http://home.51cto.com/index']
        for url in urls:
            yield Request(url, callback=self.cto_login, meta={'cookiejar': 1})

    def cto_login(self, response):
        csrf = response.xpath("//input[@name='_csrf']/@value").extract_first()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.51cto.com/',
            'Content-Type': 'text/html; charset=UTF-8'
        }
        self.logger.info("csrf value: %s" % csrf)
        yield FormRequest.from_response(response,
                                        url='https://blog.51cto.com/haoyonghui?type=1',
                                        headers=headers,
                                        meta={'cookiejar': response.meta['cookiejar']},
                                        formdata={
                                            'LoginForm[username]': '[email protected]',
                                            'LoginForm[password]': 'xxxxxx',
                                            'LoginForm[rememberMe]': '0',
                                            '_csrf': csrf,
                                        },
                                        callback=self.after_login,
                                        dont_click=True,
                                        )

    def after_login(self, response):
        item = CtoItem()
        # item = {}
        resps = response.css('ul.artical-list li')
        for resp in resps:
            # Fill the item fields
            item['title_url'] = resp.css("a.tit::attr(href)").extract_first()
            item['title'] = resp.css("a.tit::text").extract_first().strip()
            # fullname takes the form "[title](link)" because that is Markdown's link
            # syntax: clicking the title opens the link directly.
            item['fullname'] = '[' + item['title'] + ']' + '(' + item['title_url'] + ')'
            # The logger call here is only for debugging
            print("###################")
            self.logger.info("title url: %s , title: %s" % (item['title_url'], item['title']))
            yield item

        # Fetch the next page
        next_page = response.css('li.next a::attr(href)').extract_first()
        # self.logger.info("next page link: %s" % next_page)
        if next_page is not None:
            yield Request(next_page, callback=self.after_login)

    # def parse(self, response):
    #     pass
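Since fullname is already formatted as a Markdown link, one convenient way to collect the output is a small item pipeline that appends each scraped entry to a .md file. The sketch below is not part of the original project; the class name and output filename are assumptions, and it would still need to be enabled under ITEM_PIPELINES in settings.py:

# pipelines.py -- hypothetical helper; class name and output file are examples
class MarkdownExportPipeline(object):
    def open_spider(self, spider):
        # Append mode so repeated crawls keep accumulating entries
        self.file = open('blog_titles.md', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # item['fullname'] is already "[title](url)", so write it as one Markdown list item
        self.file.write('- %s\n' % item['fullname'])
        return item

    def close_spider(self, spider):
        self.file.close()

Alternatively, Scrapy's built-in feed export needs no extra code at all, e.g. scrapy crawl login2 -o titles.csv.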