The middlewares.py file
**# Swap the User-Agent and read the saved cookies**
import json
import random
from scrapy.http import HtmlResponse, Headers
from scrapy.utils.project import get_project_settings

class UAMiddleware(object):
    user_agent_list = get_project_settings()['UAPOOL']  # UA pool defined in settings.py

    def process_request(self, request, spider):
        if request.url == "https://www.qichacha.com/user_login?back=%2F":
            # The login URL is handled by JSPageMiddleware (priority 540, which runs
            # first), so short-circuit the normal download with an empty response.
            return HtmlResponse(url=request.url, encoding="utf-8")
        ua = random.choice(self.user_agent_list)
        # Read the cookies that JSPageMiddleware saved after logging in
        with open("E:\\cookies.json", "r", encoding="utf-8") as fp:
            listcookies = json.loads(fp.read())
        # Build a "name=value;" string for the Cookie header
        cookie_str = ''
        for cookie in listcookies:
            cookie_str = cookie_str + cookie['name'] + "=" + cookie['value'] + ";"
        headers = {
            'Host': 'www.qichacha.com',
            'User-Agent': ua,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Cache-Control': 'max-age=0',
            'Cookie': cookie_str,
        }
        request.headers = Headers(headers)
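A more idiomatic variant, if you would rather not read the settings at import time, is Scrapy's from_crawler hook; a minimal sketch, assuming the same UAPOOL list in settings.py:

class UAMiddleware(object):
    def __init__(self, user_agent_list):
        self.user_agent_list = user_agent_list

    @classmethod
    def from_crawler(cls, crawler):
        # UAPOOL is assumed to be a list of UA strings in settings.py
        return cls(crawler.settings.getlist('UAPOOL'))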
**# Simulated login**
# (These imports would normally sit together at the top of middlewares.py.)
import re
import time
from PIL import Image, ImageEnhance
import pytesseract
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
class JSPageMiddleware(object):
    # Drives the login page with Chrome. Slider-track generator (currently unused):
    # accelerate gently before 3/5 of the distance, harder after it, and collect
    # the per-tick moves until the target distance is covered.
    def get_track(self):
        distance = 348          # total slider distance, in pixels
        track = []              # rounded per-tick offsets
        current = 0
        mid = distance * 3 / 5  # switch point between the two acceleration phases
        t = 0.2                 # tick length
        v = 0
        while current < distance:
            a = 3 if current < mid else 6
            v0 = v
            v = v0 + a * t                     # v = v0 + a*t
            move = v0 * t + 1 / 2 * a * t * t  # s = v0*t + a*t^2/2
            current += move
            track.append(round(move))
        return track
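A quick sanity check for the generator (hypothetical usage; the rounded per-tick moves should sum to roughly the 348 px target):

track = JSPageMiddleware().get_track()
print(track, sum(track))  # expect the sum to land near 348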
**# Simulated login: save the cookies**
    def process_request(self, request, spider):
        if request.url == "https://www.qichacha.com/user_login?back=%2F":
            chrome_options = Options()
            # chrome_options.add_argument('--headless')  # headless Chrome, if desired
            # chrome_options.add_argument('--disable-gpu')
            # chrome_options.add_argument('--no-sandbox')
            # executable_path points at the local chromedriver binary
            self.browser = webdriver.Chrome(chrome_options=chrome_options,
                                            executable_path="F:\\chromedriver.exe")
            self.browser.get(request.url)
            self.browser.find_element_by_xpath('//*[@id="normalLogin"]').click()  # switch to the password-login tab
            self.browser.find_element_by_xpath('//*[@id="nameNormal"]').send_keys("13080024006")  # account
            self.browser.find_element_by_xpath('//*[@id="pwdNormal"]').send_keys("13080024006")   # password
            time.sleep(3)
            button = self.browser.find_element_by_xpath('//*[@id="nc_1__scale_text"]/span')
            # Drag the slider; perform() executes the actions queued on the chain
            action = ActionChains(self.browser)
            action.click_and_hold(button).perform()  # press and hold the left button
            action.reset_actions()                   # clear the queued actions
            # track = self.get_track()  # generated track (unused; fixed steps below)
            ttt = [23, 81, 224]  # three fixed drag steps
            a = 0
            for i in ttt:
                action.move_by_offset(xoffset=i, yoffset=0).perform()  # move the slider
                action.reset_actions()
                time.sleep(0.4)
                a = a + i
            action.release().perform()
            time.sleep(5)
            # Slider failed: the page may show a refresh prompt, then a captcha
            alert = self.browser.find_element_by_class_name('nc-lang-cnt').text
            print("slider result:", alert)
            if alert == "哎呀,出错了,点击刷新再来一次":  # "Oops, something went wrong, click refresh to retry"
                while 1:
                    # Click refresh until the slider reappears
                    while True:
                        self.browser.find_element_by_xpath('//*[@id="dom_id_one"]/div/span/a').click()
                        if button:  # the slider element is back, stop refreshing
                            break
                    # Drag the slider again with the same fixed steps
                    button = self.browser.find_element_by_xpath('//*[@id="nc_1__scale_text"]/span')
                    action.click_and_hold(button).perform()  # press and hold
                    action.reset_actions()                   # clear the queued actions
                    for i in ttt:
                        action.move_by_offset(xoffset=i, yoffset=0).perform()  # move the slider
                        action.reset_actions()
                        time.sleep(0.4)
                        a = a + i
                    action.release().perform()
                    time.sleep(5)
                    alert = self.browser.find_element_by_class_name('nc-lang-cnt').text
                    if alert != "哎呀,出错了,点击刷新再来一次":
                        break
                # After the refresh loop, the slider may demand a captcha
                if alert == "请在下方输入验证码":  # "Please enter the captcha below"
                    while 1:
                        self.browser.save_screenshot("E:\\aa.png")  # full screenshot of the login page
                        # Locate the captcha image: top-left corner plus size
                        location = self.browser.find_element_by_xpath(
                            '//*[@id="nc_1__imgCaptcha_img"]/img').location  # x/y coordinates
                        size = self.browser.find_element_by_xpath(
                            '//*[@id="nc_1__imgCaptcha_img"]/img').size      # width/height
                        coderange = (int(location['x']), int(location['y']),
                                     int(location['x'] + size['width']),
                                     int(location['y'] + size['height']))    # bounding box
                        img = Image.open("E:\\aa.png").crop(coderange)  # crop the captcha out of the screenshot
                        # Preprocess to help OCR: greyscale, then boost the contrast
                        img = img.convert('RGBA')
                        img = img.convert('L')
                        img = ImageEnhance.Contrast(img)
                        img = img.enhance(2.0)
                        img.save("E:\\bb.png")
                        # Re-read the processed image and OCR it
                        img = Image.open("E:\\bb.png")
                        time.sleep(3)
                        text = pytesseract.image_to_string(img).strip()
                        # Keep only letters and digits from the OCR output
                        b = ''.join(re.findall(r'[a-zA-Z0-9]', text))
                        print("captcha OCR:", b)
                        time.sleep(2)
                        if b == '':
                            # OCR produced nothing usable: refresh the captcha
                            time.sleep(2)
                            self.browser.find_element_by_xpath('//*[@id="nc_1__btn_1"]').click()
                        else:
                            # Type the recognized text into the captcha box, then submit
                            time.sleep(10)
                            self.browser.find_element_by_xpath('//*[@id="nc_1_captcha_input"]').send_keys(b)
                            self.browser.find_element_by_xpath('//*[@id="nc_1_scale_submit"]/span').click()  # submit
                            time.sleep(2)
                            if self.browser.find_element_by_class_name('nc-lang-cnt').text == "验证通过":  # "Verification passed"
                                break
                self.browser.find_element_by_xpath('//*[@id="user_login_normal"]/button').click()  # click Login
                time.sleep(5)
                self.browser.find_element_by_xpath(
                    '//*[@id="bindwxModal"]/div/div/div/button/span[1]').click()  # close the "bind WeChat" modal
                dictCookies = self.browser.get_cookies()
                jsonCookies = json.dumps(dictCookies)
                # After logging in, persist the cookies for UAMiddleware to reuse
                with open("E:\\cookies.json", "w") as fp:
                    fp.write(jsonCookies)
            # Slider went straight to a captcha, with no refresh prompt
            elif alert == '请在下方输入验证码':  # "Please enter the captcha below"
                while 1:
                    self.browser.save_screenshot("E:\\aa.png")  # full screenshot of the login page
                    # Locate the captcha image: top-left corner plus size
                    location = self.browser.find_element_by_xpath(
                        '//*[@id="nc_1__imgCaptcha_img"]/img').location
                    size = self.browser.find_element_by_xpath('//*[@id="nc_1__imgCaptcha_img"]/img').size
                    coderange = (int(location['x']), int(location['y']),
                                 int(location['x'] + size['width']),
                                 int(location['y'] + size['height']))  # bounding box
                    img = Image.open("E:\\aa.png").crop(coderange)  # crop the captcha out of the screenshot
                    # Same OCR preprocessing as above, with a milder contrast pass
                    img = img.convert('RGBA')
                    img = img.convert('L')
                    img = ImageEnhance.Contrast(img)
                    img = img.enhance(1.0)
                    img.save("E:\\bb.png")
                    # Re-read the processed image and OCR it
                    img = Image.open("E:\\bb.png")
                    time.sleep(3)
                    text = pytesseract.image_to_string(img).strip()
                    # Keep only letters and digits from the OCR output
                    b = ''.join(re.findall(r'[a-zA-Z0-9]', text))
                    print("captcha OCR:", b)
                    time.sleep(2)
                    if b == '':
                        self.browser.find_element_by_xpath('//*[@id="nc_1__btn_1"]').click()  # refresh the captcha
                    else:
                        # Type the recognized text into the captcha box, then submit
                        time.sleep(10)
                        self.browser.find_element_by_xpath('//*[@id="nc_1_captcha_input"]').send_keys(b)
                        self.browser.find_element_by_xpath('//*[@id="nc_1_scale_submit"]/span').click()  # submit
                        time.sleep(2)
                        if self.browser.find_element_by_class_name('nc-lang-cnt').text == "验证通过":  # "Verification passed"
                            break
                self.browser.find_element_by_xpath('//*[@id="user_login_normal"]/button').click()  # click Login
                time.sleep(5)
                self.browser.find_element_by_xpath('//*[@id="bindwxModal"]/div/div/div/button/span[1]').click()  # close the "bind WeChat" modal
                dictCookies = self.browser.get_cookies()
                jsonCookies = json.dumps(dictCookies)
                # Persist the cookies for UAMiddleware to reuse
                with open("E:\\cookies.json", "w") as fp:
                    fp.write(jsonCookies)
            # Slider passed on the first try: log in directly
            else:
                self.browser.find_element_by_xpath('//*[@id="user_login_normal"]/button').click()  # click Login
                time.sleep(5)
                self.browser.find_element_by_xpath('//*[@id="bindwxModal"]/div/div/div/button/span[1]').click()  # close the "bind WeChat" modal
                dictCookies = self.browser.get_cookies()
                jsonCookies = json.dumps(dictCookies)
                # Persist the cookies for UAMiddleware to reuse
                with open("E:\\cookies.json", "w") as fp:
                    fp.write(jsonCookies)
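If you ever want to reuse the saved session inside Selenium itself, rather than through the Cookie header that UAMiddleware builds, the saved list can be fed back through add_cookie; a minimal sketch, assuming the same E:\cookies.json written above:

# Minimal sketch: restore the saved qichacha session into a fresh Chrome instance.
import json
from selenium import webdriver

browser = webdriver.Chrome()              # assumes chromedriver is on PATH
browser.get("https://www.qichacha.com")   # must visit the domain before add_cookie
with open("E:\\cookies.json", "r", encoding="utf-8") as fp:
    for cookie in json.load(fp):
        cookie.pop("expiry", None)        # drop a key some Chrome versions reject
        browser.add_cookie(cookie)
browser.refresh()                         # reload the page as a logged-in user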
The settings.py file
DOWNLOADER_MIDDLEWARES = {
    # 'ScrapyDemo.middlewares.ScrapydemoDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'ScrapyDemo.middlewares.UAMiddleware': 543,      # swaps the User-Agent and injects the saved cookies
    # 'ScrapyDemo.middlewares.ProxyMiddleware': 544, # proxy rotation (didn't work well)
    'ScrapyDemo.middlewares.JSPageMiddleware': 540,  # lower number, runs first: performs the automated login
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
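UAMiddleware expects a UAPOOL list in these settings, which this post does not show; a minimal sketch of what it could look like (the UA strings are placeholders, any desktop UAs will do):

# Hypothetical UAPOOL definition in settings.py; any list of UA strings works.
UAPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
]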
The spider.py file
import time
import urllib.parse
import scrapy
from scrapy.selector import Selector
from ScrapyDemo.items import QiccItem
from openpyxl import Workbook
from openpyxl import load_workbook

wb = Workbook()
ws = wb.active

class QichachaSpider(scrapy.Spider):
    name = 'Qichacha'
    allowed_domains = ['qichacha.com']
    # custom_settings = {'ITEM_PIPELINES': {'ScrapyDemo.pipelines.QiccPipline': 300}}
    x = 1

    def start_requests(self):
        # Hit the login URL first so JSPageMiddleware can log in and save the cookies
        yield scrapy.Request(url='https://www.qichacha.com/user_login?back=%2F',
                             callback=self.parse, dont_filter=True)
    def parse(self, response):
        # Companies to look up
        f = ["深圳市腾讯计算机系统有限公司",
             "阿里巴巴"]
        # Alternatively, read the company names from an Excel sheet:
        # wb = load_workbook('F:\\text.xlsx')
        # for cell in wb.active['A']:
        #     if cell == wb.active['A1']:  # skip the header row
        #         continue
        #     company = urllib.parse.quote(cell.value).replace('\n', "")
        #     url = 'https://www.qichacha.com/search?key=' + company
        #     time.sleep(5)
        for link in f:
            company = urllib.parse.quote(link)  # URL-encode the company name
            url = 'https://www.qichacha.com/search?key=' + company
            print(url)
            time.sleep(10)  # crude rate limiting between searches
            yield scrapy.Request(url, callback=self.parse1, errback=self.errparse1, dont_filter=True)

    def errparse1(self, err):
        print("search request failed:", err)
    def parse1(self, response):
        # Take the first company in the search results and open its detail page
        link = response.xpath('//tbody/tr[1]/td[3]/a/@href').extract_first()
        if link:
            detail_link = response.urljoin(link)
            print(detail_link)
            time.sleep(2)
            yield scrapy.Request(url=detail_link, callback=self.parse2,
                                 dont_filter=True, errback=self.errparse2)
        else:
            print("parse1: no search result found")

    def errparse2(self, err):
        print("detail request failed:", err)
    def parse2(self, response):
        item = QiccItem()
        sel = Selector(response)
        item['name'] = sel.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first()  # company name
        Official_website = sel.xpath('//div[@class="dcontent"]/div/span/a/@href').extract_first()  # official website
        if Official_website:
            item['Official_website'] = Official_website
        else:
            item['Official_website'] = '暂无'  # "not available"
        item['Listed_company'] = sel.xpath('//header/div[2]/div[1]/a/h2/text()').extract_first()  # first header tab
        if item['Listed_company'] == '基本信息':  # "Basic info": not a listed company
            item['registered_capital'] = sel.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[1]/td[2]/text()').extract_first()  # registered capital
            item['company_type'] = sel.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[5]/td[2]/text()').extract_first()  # company type
            item['staff_size'] = sel.xpath('//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[9]/td[2]/text()').extract_first()  # staff size
            yield item
            print(item)
        else:
            # Listed company: follow the link to its basic-info page
            base = response.xpath('//header/div[2]/div[2]/a/@href').extract_first()
            detail_base = response.urljoin(base)
            time.sleep(2)
            # parse3/errparse3 continue from here (not shown in this section)
            yield scrapy.Request(detail_base, meta={'items': item},
                                 callback=self.parse3, errback=self.errparse3)