Scrapy + Selenium: simulated login to Qichacha (企查查)

In short: JSPageMiddleware drives a real Chrome session through the login page (slider verification plus image captcha) and saves the resulting cookies to a local JSON file, while UAMiddleware attaches those cookies and a random User-Agent to every subsequent request.

middlewares.py

**# Swap the User-Agent and read the saved cookies**
import json
import random
import time

from scrapy.http import HtmlResponse, Headers
from scrapy.utils.project import get_project_settings
from selenium.webdriver.chrome.options import Options

settings = get_project_settings()


class UAMiddleware(object):

    user_agent_list = settings['UAPOOL']

    def process_request(self, request, spider):
        if request.url == "https://www.qichacha.com/user_login?back=%2F":
            # JSPageMiddleware (priority 540) has already handled the login in a
            # real browser; return an empty response so Scrapy skips the download.
            return HtmlResponse(url=request.url, encoding="utf-8")

        ua = random.choice(self.user_agent_list)
        # Read the cookies that the login step saved to disk.
        with open('E:cookies.json', 'r', encoding='utf-8') as fp:
            listcookies = json.loads(fp.read())
        # Assemble the Cookie header string from the saved cookie list.
        cookie_str = ''
        for cookie in listcookies:
            cookie_str += cookie['name'] + "=" + cookie['value'] + ";"
        print(cookie_str)
        headers = {
            'Host': 'www.qichacha.com',
            'User-Agent': ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Cache-Control': 'max-age=0',
            'Cookie': cookie_str}
        request.headers = Headers(headers)
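
The cookie file read above is simply the JSON dump of Selenium's get_cookies() that JSPageMiddleware (below) writes after logging in: a list of objects, each with at least a name and a value. E:cookies.json therefore looks roughly like this (cookie names and values here are placeholders, not real Qichacha cookies):

[
  {"name": "session_id", "value": "abc123", "domain": ".qichacha.com", "path": "/"},
  {"name": "token", "value": "def456", "domain": ".qichacha.com", "path": "/"}
]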


**# Simulated login**
import re

from PIL import Image, ImageEnhance
import pytesseract
from selenium import webdriver
from selenium.webdriver import ActionChains
class JSPageMiddleware(object):

    # Build a human-like slider track from simple accelerated motion
    # (kept for reference; the login below uses fixed offsets instead).
    def get_strack(self):
        distance = 348
        track = []
        current = 0
        mid = distance * 3 / 5
        t = 0.2
        v = 0
        while current < distance:
            # accelerate gently over the first 3/5 of the distance, then harder
            a = 3 if current < mid else 6
            v0 = v
            v = v0 + a * t
            move = v0 * t + 1 / 2 * a * t * t
            current += move
            track.append(round(move))
        print(track)
        return track

**# Simulated login: save the cookies**

    def save_cookies(self):
        # After a successful login, dump the browser cookies to a local file
        # for UAMiddleware to reuse.
        dict_cookies = self.browser.get_cookies()
        print(dict_cookies)
        with open("E:cookies.json", "w") as fp:
            fp.write(json.dumps(dict_cookies))

    def finish_login(self):
        # Click the login button, close the "bind WeChat" popup, then persist cookies.
        self.browser.find_element_by_xpath('//*[@id="user_login_normal"]/button').click()
        time.sleep(5)
        self.browser.find_element_by_xpath('//*[@id="bindwxModal"]/div/div/div/button/span[1]').click()  # close the popup
        self.save_cookies()

    def drag_slider(self):
        # Drag the slider in three fixed steps (23 + 81 + 224 px); the
        # physics-based track from get_strack() proved unnecessary here.
        button = self.browser.find_element_by_xpath('//*[@id="nc_1__scale_text"]/span')
        action = ActionChains(self.browser)
        action.click_and_hold(button).perform()  # press and hold the left mouse button
        action.reset_actions()                   # clear the queued actions
        for offset in (23, 81, 224):
            action.move_by_offset(xoffset=offset, yoffset=0).perform()  # move the slider
            action.reset_actions()
            time.sleep(0.4)
        action.release().perform()
        time.sleep(5)

    def solve_captcha(self):
        # Screenshot the page, crop out the captcha image, enhance it, OCR it
        # with pytesseract, submit, and repeat until the page reports success.
        while True:
            self.browser.save_screenshot(r"E:aa.png")  # capture the login page
            # Locate the captcha image: top-left corner plus width/height.
            captcha = self.browser.find_element_by_xpath('//*[@id="nc_1__imgCaptcha_img"]/img')
            location = captcha.location
            size = captcha.size
            box = (int(location['x']), int(location['y']),
                   int(location['x'] + size['width']), int(location['y'] + size['height']))
            img = Image.open(r"E:aa.png").crop(box)  # crop the captcha region out of the screenshot
            # Preprocess for better OCR: greyscale, then boost the contrast.
            img = img.convert('L')
            img = ImageEnhance.Contrast(img).enhance(2.0)
            img.save("E:bb.png")
            img = Image.open("E:bb.png")
            time.sleep(3)
            text = pytesseract.image_to_string(img).strip()  # OCR the captcha
            # Keep only the alphanumeric characters from the OCR output.
            code = ''.join(re.findall(r'[a-zA-Z0-9]', text))
            print(code)
            time.sleep(2)
            if code == '':
                time.sleep(2)
                self.browser.find_element_by_xpath('//*[@id="nc_1__btn_1"]').click()  # refresh the captcha
            else:
                time.sleep(10)
                # self.browser.find_element_by_xpath('//*[@id="nc_1_captcha_input"]').send_keys(code)  # type the code
                self.browser.find_element_by_xpath('//*[@id="nc_1_scale_submit"]/span').click()  # submit
            time.sleep(2)
            if self.browser.find_element_by_class_name('nc-lang-cnt').text == "验证通过":
                break  # "verification passed" -- done

    def process_request(self, request, spider):
        if request.url == "https://www.qichacha.com/user_login?back=%2F":
            chrome_options = Options()
            # chrome_options.add_argument('--headless')  # headless Chrome
            # chrome_options.add_argument('--disable-gpu')
            # chrome_options.add_argument('--no-sandbox')
            # Point Selenium at the local chromedriver binary.
            self.browser = webdriver.Chrome(chrome_options=chrome_options, executable_path="F:chromedriver.exe")
            self.browser.get(request.url)

            self.browser.find_element_by_xpath('//*[@id="normalLogin"]').click()  # switch to the password-login tab
            self.browser.find_element_by_xpath('//*[@id="nameNormal"]').send_keys("13080024006")  # account
            self.browser.find_element_by_xpath('//*[@id="pwdNormal"]').send_keys("13080024006")  # password
            time.sleep(3)
            self.drag_slider()

            alert = self.browser.find_element_by_class_name('nc-lang-cnt').text
            print(alert)
            # Slider failed: click refresh and drag again until the message changes.
            if alert == "哎呀,出错了,点击刷新再来一次":
                while True:
                    self.browser.find_element_by_xpath('//*[@id="dom_id_one"]/div/span/a').click()  # click refresh
                    self.drag_slider()
                    alert = self.browser.find_element_by_class_name('nc-lang-cnt').text
                    if alert != "哎呀,出错了,点击刷新再来一次":
                        break
                if alert == "请在下方输入验证码":  # a captcha appeared after the retries
                    self.solve_captcha()
                self.finish_login()

            # Slider passed but a captcha is required.
            elif alert == '请在下方输入验证码':
                self.solve_captcha()
                self.finish_login()

            # Slider passed outright: log in directly.
            else:
                self.finish_login()

settings.py

DOWNLOADER_MIDDLEWARES = {
    # 'ScrapyDemo.middlewares.ScrapydemoDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    # 'ScrapyDemo.uamid.Uamid': 555,
    'ScrapyDemo.middlewares.UAMiddleware': 543,      # swaps the User-Agent and attaches the saved cookies
    # 'ScrapyDemo.middlewares.ProxyMiddleware': 544,  # proxy-IP rotation (didn't work well)
    'ScrapyDemo.middlewares.JSPageMiddleware': 540,  # matches middlewares.py; performs the automated login
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
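
UAMiddleware reads settings['UAPOOL'], which the post never defines. It is assumed to be a plain list of User-Agent strings in settings.py, something like:

# Assumed UAPOOL definition: a pool of User-Agent strings for UAMiddleware to choose from.
UAPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
]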

spider.py

import time
import urllib.parse

import scrapy
from scrapy.selector import Selector
from openpyxl import load_workbook  # only needed by the commented-out Excel branch below
from ScrapyDemo.items import QiccItem


class QichachaSpider(scrapy.Spider):
    name = 'Qichacha'
    allowed_domains = ['qichacha.com']

    # custom_settings = {'ITEM_PIPELINES': {'ScrapyDemo.pipelines.QiccPipline': 300}}

    def start_requests(self):
        # Request the login URL first so JSPageMiddleware performs the Selenium login.
        yield scrapy.Request(url='https://www.qichacha.com/user_login?back=%2F', callback=self.parse, dont_filter=True)

    def parse(self, response):
        # Companies to look up
        f = ["深圳市腾讯计算机系统有限公司",
             "阿里巴巴"]

        # Alternatively, read the company names from an Excel file:
        # wb = load_workbook('F:\\text.xlsx')
        # for cell in wb.active['A']:
        #     if cell == wb.active['A1']:  # skip the header row
        #         continue
        #     company = urllib.parse.quote(cell.value).replace('\n', "")
        #     url = 'https://www.qichacha.com/search?key=' + company
        #     time.sleep(5)

        for link in f:
            company = urllib.parse.quote(link)
            url = 'https://www.qichacha.com/search?key=' + company
            print(url)
            time.sleep(10)
            yield scrapy.Request(url, callback=self.parse1, errback=self.errparse1, dont_filter=True)

    def errparse1(self, err):
        print("search request failed:", err)

    def parse1(self, response):
        # Take the first company in the search results and open its detail page.
        link = response.xpath('//tbody/tr[1]/td[3]/a/@href').extract_first()
        if link:
            detail_link = response.urljoin(link)
            print(detail_link)
            time.sleep(2)
            yield scrapy.Request(url=detail_link, callback=self.parse2, dont_filter=True, errback=self.errparse2)
        else:
            print("no search result found")

    def errparse2(self, err):
        print("detail request failed:", err)

    def parse2(self, response):
        item = QiccItem()
        sel = Selector(response)
        item['name'] = sel.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first()  # company name
        Official_website = sel.xpath('//div[@class="dcontent"]/div/span/a/@href').extract_first()  # official website
        if Official_website:
            item['Official_website'] = Official_website
        else:
            item['Official_website'] = '暂无'  # "none listed"
        item['Listed_company'] = sel.xpath('//header/div[2]/div[1]/a/h2/text()').extract_first()  # listed-company tab
        if item['Listed_company'] == '基本信息':  # "basic info" tab: not a listed company
            item['registered_capital'] = sel.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[1]/td[2]/text()').extract_first()  # registered capital
            item['company_type'] = sel.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[5]/td[2]/text()').extract_first()  # company type
            item['staff_size'] = sel.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[9]/td[2]/text()').extract_first()  # staff size
            print(item)
            yield item
        else:  # listed company: follow the link to its basic-info page
            base = response.xpath('//header/div[2]/div[2]/a/@href').extract_first()
            detail_base = response.urljoin(base)
            time.sleep(2)
            yield scrapy.Request(detail_base, meta={'items': item}, callback=self.parse3, errback=self.errparse3)
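
QiccItem lives in ScrapyDemo/items.py, which the post does not show (the excerpt also ends before parse3/errparse3, which presumably handle the listed company's basic-info page the same way parse2 does). Judging from the fields used above, a matching items.py would look roughly like this sketch:

import scrapy

class QiccItem(scrapy.Item):
    # Field names inferred from the spider above; not the author's original file.
    name = scrapy.Field()                # company name
    Official_website = scrapy.Field()    # official website
    Listed_company = scrapy.Field()      # listed-company tab text
    registered_capital = scrapy.Field()  # registered capital
    company_type = scrapy.Field()        # company type
    staff_size = scrapy.Field()          # staff size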
