title: Scraping Zhilian Zhaopin and the National Enterprise Credit Information Publicity System with Selenium
date: 2019-09-18 09:28:46
categories: Web Scraping
tags:
Defeating obfuscated and encrypted JS challenges, IP bans, captcha recognition (slider and ordered-click both appear), User-Agent checks, and cookies stitched together across multiple URLs
The spider hits the search API, reads the returned JSON to discover new pages, and parses each detail page with Selenium. The sketch below shows the response shape the callbacks rely on.
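Field names are taken from the parse callbacks that follow (data.count, data.results, positionURL); every value here is invented for illustration:

# Illustrative response shape only; not captured from the live API.
sample_response = {
    "data": {
        "count": 1234,  # total hits; parse_pages pages through them 90 at a time
        "results": [
            {"positionURL": "https://jobs.zhaopin.com/..."},  # followed in parse -> parse_detail
        ],
    }
}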
import json
import platform
import time
from datetime import datetime

import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree
from fake_useragent import UserAgent

# Project-local pieces: JobItem is the Scrapy item; null_if / remove_html /
# get_md5 are small helpers (sketched after the spider). The import paths are
# placeholders for wherever they live in this repo.
from items import JobItem
from utils import null_if, remove_html, get_md5


class ZhilianSpider(scrapy.Spider):
name = 'zhilian'
allowed_domains = ['zhaopin.com']
start_urls = ['https://sou.zhaopin.com/']
driver = None
chrome_options = webdriver.ChromeOptions()
# proxy_url = get_random_proxy()
    # print(proxy_url + " proxy server is crawling")
# chrome_options.add_argument('--proxy-server=https://' + proxy_url.strip())
    prefs = {
        'profile.default_content_setting_values': {
            'images': 1,  # 1 = load images (2 would block them)
        }
    }
    chrome_options.add_experimental_option("prefs", prefs)
    # Chrome prefs cannot change the User-Agent; it has to go in as a switch.
    chrome_options.add_argument('--user-agent=' + UserAgent().random)
if platform.system() == "Windows":
driver = webdriver.Chrome('chromedriver.exe', chrome_options=chrome_options)
elif platform.system() == "Linux":
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome(
executable_path="/usr/bin/chromedriver",
chrome_options=chrome_options)
wait = WebDriverWait(driver, 15)
def start_requests(self):
data = ["游戏", "期货", "贷款"]
for kw in data:
yield Request(
url="https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=90&cityId=639&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=" + kw + "&kt=3",
meta={"kw": kw},
                callback=self.parse_pages)  # kw travels via response.meta
def parse_pages(self, response):
numtotal = json.loads(response.text)["data"]["count"]
kw = response.meta.get("kw", "游戏")
for i in range(0, numtotal // 90 + 1):
url = "https://fe-api.zhaopin.com/c/i/sou?start=" + str(
90 * i) + "&pageSize=90&cityId=639&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=" + kw + "&kt=3"
yield Request(
url=url,
meta={"kw": kw},
                callback=self.parse)  # kw travels via response.meta
def parse(self, response):
job_list = json.loads(response.text)["data"]["results"]
for job in job_list:
yield Request(url=job["positionURL"], callback=self.parse_detail,
meta={'cookiejar': 'chrome', 'kw': response.meta.get("kw", "")})
def parse_detail(self, response):
print(response.url)
self.driver.get(response.url)
self.driver.refresh()
time.sleep(2)
self.driver.implicitly_wait(20)
dom = etree.HTML(self.driver.page_source)
item = JobItem()
item['recruitment_position'] = null_if(dom.xpath('//*[@class="summary-plane__title"]'))
item['salary'] = null_if(dom.xpath('//*[@class="summary-plane__salary"]'))
item['company_name'] = dom.xpath('//*[@class="company__title"]')[0].text
item['work_experience'] = dom.xpath('//ul[@class="summary-plane__info"]/li[2]')[0].text
item['education_background'] = dom.xpath('//ul[@class="summary-plane__info"]/li[3]')[0].text
item['job_requirements'] = remove_html(
etree.tostring(dom.xpath('//div[@class="describtion__detail-content"]')[0], encoding="utf-8").decode(
'utf-8'))
item['company_info'] = null_if(dom.xpath('//div[@class="company__description"]'))
item['company_address'] = remove_html(
etree.tostring(dom.xpath('//span[@class="job-address__content-text"]')[0], encoding="utf-8").decode(
'utf-8'))
if len(dom.xpath('//div[@class="highlights__content"]')):
item['company_welfare'] = remove_html(etree.tostring(dom.xpath('//div[@class="highlights__content"]')[0], encoding="utf-8").decode('utf-8'))
else:
item['company_welfare'] = '无'
item['id'] = get_md5(self.driver.current_url)
item['keyword'] = response.meta.get("kw", "")
item['url'] = response.url
item['crawl_date'] = datetime.now().strftime("%Y-%m-%d")
yield item
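The spider leans on a few project helpers that this post does not show. A minimal sketch of what they are assumed to do (the names come from the code above; the bodies are guesses, not the author's originals):

import re
from hashlib import md5


def null_if(nodes):
    # Assumed helper: text of the first matched node, or '无' when the XPath found nothing.
    return nodes[0].text if nodes else '无'


def remove_html(fragment):
    # Assumed helper: strip tags from an HTML fragment and collapse whitespace.
    return re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', fragment)).strip()


def get_md5(url):
    # Assumed helper: stable primary key derived from the page URL.
    return md5(url.encode('utf-8')).hexdigest()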
crack.py — cracking the JiaSuLe (加速乐) cookie challenge used by gsxt.gov.cn:
class Crack(object):
"""
同一ip频繁使用:
出现正常200但是没有结果
第一次解密出来是错误的
"""
def __init__(self, url, test_url):
path = os.getcwd()
with open(os.path.join(path, "wc_js.js"), encoding='utf-8') as f:
wc_js = f.read()
self.wc_js = execjs.compile(wc_js)
self.url = url
self.test_url = test_url
        # Pin the User-Agent: the server binds the cookies to the UA, so all later requests must reuse this one
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
}
def acquire_js(self):
"""
不带cookies请求首页,获得返回的js
:return:页面中的js,和set_cookies中的jsluid
"""
response = requests.get(self.url, headers=self.headers)
if response.status_code == 521:
return response.text, response.headers['Set-Cookie'].split('=')[1].split(';')[0]
else:
print(response.text)
print(self.headers)
return None, None
def first_decryption(self, first_js):
"""
解密js,获得第二层加密的js
:param first_js:
:return:
"""
x = re.findall('var x="(.*?)"', first_js)[0]
y = re.findall(',y="(.*?)"', first_js)[0]
second_js = self.wc_js.call('once_js', x, y)
# second_js = self.wc_js.call('get_js', x, y, z)
return second_js
    def regex(self, js):
        # The challenge JS probes browser-only globals such as window.headless.
        # execjs runs without a window object, so rewrite each probe to the
        # value it would take in a real browser, where window[...] is undefined:
        # an odd number of '!' gives true, an even number false, and a bare
        # reference becomes undefined.
        regex = "!*window\[.*?\]"
find = re.findall(regex, js)
if find:
for f in find:
if '!' in f:
if len(re.findall('!', f)) % 2 == 0:
js = js.replace(f, 'false')
else:
js = js.replace(f, 'true')
else:
js = js.replace(f, 'undefined')
js = js.replace('window.headless', 'undefined')
return js
def replace_url(self, js):
        # Pull out the two temporary variable names...
_3d = re.findall("(var .{0,5}=)document\.createElement\('div'\);", js)
_2b = re.findall("(var .{0,5}=).{0,5}\.match\(/https\?:\\\/\\\//\)\[0\];", js)
        # ...and rewrite the DOM-based URL probe to the target URL itself
js = re.sub("var .{0,5}=document\.createElement\('div'\);", _3d[0] + f'"{self.url.replace("http://", "")}";',
js)
js = re.sub("_.{0,5}\.innerHTML='';", "", js)
js = re.sub("_.{0,5}=.{0,5}\.firstChild\.href;", "", js)
js = re.sub("var .{0,5}=.{0,5}\.match\(/https\?:\\\/\\\//\)\[0\];", _2b[0] + '"http://";', js)
js = re.sub("_.{0,5}=.{0,5}\.substr\(.{0,5}\.length\)\.toLowerCase\(\);", "", js)
return js
def second_decryption(self, second_js):
"""
把第二层js准换成本地可以运行的js
!!!此处可能会出错!!!
:param second_js: 第一次解密的js
:return: __jsl_clearance的值
"""
        # Un-escape the doubled backslashes
js = second_js.replace('\\\\', '\\')
        # Slice out the document.cookie assignment
js = 'cookie' + js.split('document.cookie')[1]
js = js.split('GMT;Path=/;')[0] + "'"
if re.findall("(var .{0,5}=)document\.createElement\('div'\);", js):
js = self.replace_url(js)
        # Replace any window[...] probes (there is no browser here)
js = self.regex(js)
s = """
function cook() {
%s
return cookie
}
"""
new_js = s % js
ctx = execjs.compile(new_js)
        # Extract __jsl_clearance from the cookie string
jsl = ctx.call('cook')
jsl = jsl.split(';')[0]
jsl_clearance = jsl.split('=')[1]
return jsl_clearance
def test_cookies(self, jsluid, jsl_clearance):
"""
带cookies访问,测试拿到的是否正确
:param jsluid:cookies中的参数
:param jsl_clearance: cookies中的参数
:return:
"""
headers = self.headers.copy()
headers['Cookie'] = f'__jsluid_h={jsluid}; __jsl_clearance={jsl_clearance};'
response = requests.get(self.test_url, headers=headers)
print(response.text)
return response.status_code
def run(self):
while True:
first_js, jsluid = self.acquire_js()
second_js = self.first_decryption(first_js)
try:
jsl_clearance = self.second_decryption(second_js)
            except Exception:
# print(second_js)
continue
else:
code = self.test_cookies(jsluid, jsl_clearance)
if code == 200:
return jsluid, jsl_clearance
else:
print(code)
# print(second_js)
continue
if __name__ == '__main__':
    # # National Enterprise Credit Information Publicity System
url = "http://www.gsxt.gov.cn/index.html"
test_url = "http://www.gsxt.gov.cn/index.html"
    # # 66ip proxy site
# url = "http://www.66ip.cn/2.html"
# test_url = "http://www.66ip.cn/2.html"
    # # Ministry of Public Security site
# url = 'http://www.mps.gov.cn/'
# test_url = 'http://www.mps.gov.cn/'
ck = Crack(url, test_url)
jsluid, jsl_clearance = ck.run()
print('jsluid:', jsluid)
print('jsl_clearance:', jsl_clearance)
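Because the server binds the cookies to the User-Agent that earned them (see the comment in Crack.__init__), any follow-up scraping must reuse both. A minimal sketch, assuming the Crack class and the variables from the block above, of carrying the cracked values in a requests session:

import requests

session = requests.Session()
# The UA must be the same one Crack used, or the cookies are rejected.
session.headers['User-Agent'] = ck.headers['User-Agent']
session.cookies.set('__jsluid_h', jsluid)
session.cookies.set('__jsl_clearance', jsl_clearance)
print(session.get(test_url).status_code)  # 200 once the cookies are accepted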
class SearchResultParse(object):
    '''Parse the search-result page.
    '''
def __init__(self, pagesource, base_url, parse_rule):
self.selector = etree.HTML(pagesource)
self.url_list = []
self.base_url = base_url
self.parse_rule = parse_rule['search_result_url']
def search_result_parse(self):
self.url_list = [self.base_url + i for i in self.selector.xpath(self.parse_rule)]
return self.url_list
class PageDetailParse(object):
    '''Parse a detail page.
    '''
def __init__(self, pagesource, parse_rule):
self.selector = etree.HTML(pagesource)
self.parse_rule = parse_rule
self.info_list = {}
def search_result_parse(self, primary_info=None):
if primary_info is None:
primary_info = []
for i in self.parse_rule['primaryinfo']:
primary_info.append(
self.selector.xpath(i).replace("\n", "").replace("\t", "").replace("\r", "").replace(" ", ""))
self.info_list['primary_info'] = primary_info
return self.info_list
class CookieRequest(object):
    '''Fetch each search result with the cracked cookies.
    '''
def __init__(self, url_list=None):
        '''Set up the requests session whose cookies carry the cracked values.
        '''
self.url_list = url_list
self.session = requests.Session()
self.result = []
self.headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
}
def cookie_requests(self):
        '''Visit each result URL in turn with the cookies attached.
        '''
url = "http://www.gsxt.gov.cn/index.html"
test_url = "http://www.gsxt.gov.cn/corp-query-entprise-info-hot-search-list.html?province=100000"
        ck = Crack(url, test_url)
        # NOTE: Crack.run() as shown above returns only (jsluid, jsl_clearance);
        # this three-value unpack assumes a variant that also captures
        # JSESSIONID from the Set-Cookie headers.
        jsluid, jsl_clearance, JSESSIONID = ck.run()
        self.headers['Cookie'] = f'__jsluid_h={jsluid}; __jsl_clearance={jsl_clearance}; JSESSIONID={JSESSIONID}'
for url in self.url_list:
response = self.session.get(url=url, headers=self.headers)
self.result.append(response.text)
time.sleep(5)
return self.result
class MaxEnterError(Exception):
    '''Raised when keyword entry exceeds the maximum number of attempts.
    '''
def __init__(self, ErrorInfo):
        super().__init__(ErrorInfo)  # initialise the base Exception
self.errorinfo = ErrorInfo
def __str__(self):
return self.errorinfo
class GtClickShot(object):
    def __init__(self, username, password, soft_id):
        '''Initialise the Chaojiying (超级鹰) captcha-solving client.
        args:
            username(str): Chaojiying account name
            password(str): Chaojiying password
            soft_id(str): software ID issued by the platform
        '''
self.username = username
self.password = md5(password.encode("utf-8")).hexdigest()
self.soft_id = soft_id
self.base_params = {
'user': self.username,
'pass2': self.password,
'softid': self.soft_id,
}
self.headers = {
'Connection': 'Keep-Alive',
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
}
def PostPic(self, im, codetype):
"""发送图片至打码平台
args:
im(Byte): 图片字节
codetype(str): 题目类型 参考 http://www.chaojiying.com/price.html
return(json):返回打码信息,包含坐标信息,坐标信息用“|”隔开
"""
params = {
'codetype': codetype,
}
params.update(self.base_params)
files = {'userfile': ('ccc.jpg', im)}
r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
headers=self.headers)
return r.json()
def ReportError(self, im_id):
"""识别错误返回题分
args:
im_id(str):报错题目的图片ID
return(str):报错反馈
"""
params = {
'id': im_id,
}
params.update(self.base_params)
r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
return r.json()
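For context, the shape of a click-captcha result as consumed by click_captcha_validate further down: pic_str holds '|'-separated "x,y" pairs and pic_id identifies the task for refunds. A small sketch with invented values:

# Sample result (values invented; the format mirrors how the code below unpacks it).
coord_result = {"pic_str": "121,77|205,143", "pic_id": "9001234567890"}
points = [[int(num) for num in group.split(',')]
          for group in coord_result["pic_str"].split('|')]
print(points)  # [[121, 77], [205, 143]]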
class CorpSearch(object):
def __init__(self, init_url, index_url, headers, max_click):
        '''Initialise.
        args:
            init_url: bootstrap URL; the JiaSuLe (加速乐) anti-crawl JS requires
                visiting it before the target site to obtain gt and challenge
            index_url: home-page URL of the target site
            headers: request header strings
            max_click: maximum number of repeated clicks, to cope with an
                unresponsive search button
        self.wait: default maximum wait for expected conditions
        self.click_valitimes: click-captcha attempt counter; above 0 a failed
            answer must be reported for a refund, at 0 it must not
        '''
chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values': {
                'images': 1,  # load images (the captcha screenshots need them)
            }
        }
        chrome_options.add_experimental_option("prefs", prefs)
        # Chrome prefs cannot change the User-Agent; it has to go in as a switch.
        chrome_options.add_argument('--user-agent=' + UserAgent().random)
self.init_url = init_url
self.index_url = index_url
if platform.system() == "Windows":
self.driver = webdriver.Chrome('chromedriver.exe', chrome_options=chrome_options)
elif platform.system() == "Linux":
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
self.driver = webdriver.Chrome(
executable_path="/usr/bin/chromedriver",
chrome_options=chrome_options)
self.wait = WebDriverWait(self.driver, 50)
self.max_entertimes = max_click
self.click_valitimes = 0
self.action = ActionChains(self.driver)
self.gt_shot = GtClickShot("****", "*****","901554")
        # NOTE: the header strings below are not valid Chrome switches, and this
        # options object is never attached to the driver, so they have no effect.
        self.options = webdriver.ChromeOptions()
        self.headers = headers
        for option in self.headers:
            self.options.add_argument(option)
    # Bootstrap: defeat the JiaSuLe anti-crawl JS, obtain gt and challenge, then load the home page
def init(self):
        '''
        Visit the bootstrap URL, then open the home page.
        '''
self.driver.get(self.init_url)
self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > pre:nth-child(1)")))
self.driver.get(self.index_url)
    # Load the home page, type the query keyword, and click the search button.
    # If the click has no effect, press Enter again automatically, up to a set
    # maximum; once exceeded, raise an exception and abort.
def input_query(self, keyword):
        '''Type the keyword and submit the query.
        args:
            keyword: the search keyword
        return:
            used only to exit the method
        '''
enter_word = self.wait.until(EC.presence_of_element_located((By.ID, "keyword")))
self.wait.until(EC.presence_of_element_located((By.ID, "btn_query")))
time.sleep(random.randint(8, 15) / 10)
enter_word.send_keys(keyword)
time.sleep(random.randint(5, 10) / 10)
enter_word.send_keys(Keys.ENTER)
while True:
if self.max_entertimes == 0:
                raise MaxEnterError('---Exceeded the maximum number of search submissions---')
gt_panel = self.driver.find_element_by_css_selector("body > div.geetest_panel.geetest_wind")
style_value = gt_panel.value_of_css_property("display")
if style_value.strip() == "block":
break
else:
enter_word.send_keys(Keys.ENTER)
time.sleep(random.randint(1, 5) / 10)
self.max_entertimes -= 1
return
    # Check whether the page contains a given element (looked up by class name)
def is_element_exist(self, class_name):
        '''Check whether an element exists.
        args:
            class_name: the element's class attribute
        return:
            True if present, False otherwise
        '''
try:
self.driver.find_element_by_class_name(class_name)
return True
        except NoSuchElementException:
return False
    # Take a screenshot and keep it in memory to speed up the image processing
def get_screenshot(self):
        '''Take a screenshot.
        return:
            the screenshot as a PIL image
        '''
screenshot = self.driver.get_screenshot_as_png()
screenshot = Image.open(BytesIO(screenshot))
return screenshot
    # Get the position of the captcha image, used for cropping the screenshot
def get_position(self, pos_img):
        '''Coordinates and size of the captcha element.
        args:
            pos_img: the captcha anchor element
        return:
            the element's bounding box as (left, top, right, bottom)
        '''
location = pos_img.location
size = pos_img.size
top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
'width']
return (left, top, right, bottom)
    # For the slider captcha, screenshot the puzzle image both with and without the notch
def get_slide_images(self):
        '''Capture the captcha image with and without the notch.
        '''
canvas_img = self.wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, ".geetest_canvas_img.geetest_absolute > div")))
position = self.get_position(canvas_img)
befor_screenshot = self.get_screenshot()
befor_img = befor_screenshot.crop(position)
befor_img.save("befor_click.png")
btn_slide = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_slider_button")))
self.action.click_and_hold(btn_slide).perform()
after_screenshot = self.get_screenshot()
after_img = after_screenshot.crop(position)
after_img.save("after_click.png")
    # Locate the notch and compute the slide distance (grayscale, diff, threshold denoising, find the notch, compute the distance)
def get_slide_distance(self):
        '''Compute the slide distance.
        return:
            the distance to slide
        '''
        # The screenshots were saved to the working directory above, so read
        # them back from there rather than from a machine-specific path.
        befor_click_img = "befor_click.png"
        after_click_path = "after_click.png"
befor_img = cv2.imread(befor_click_img)
after_img = cv2.imread(after_click_path)
befor_gray = cv2.cvtColor(befor_img, cv2.COLOR_BGR2GRAY)
after_gray = cv2.cvtColor(after_img, cv2.COLOR_BGR2GRAY)
img_diff = np.array(befor_gray) - np.array(after_gray)
height, width = img_diff.shape
for i in range(height):
for j in range(width):
if img_diff[i][j] > 245 or img_diff[i][j] < 60:
img_diff[i][j] = 0
start_position = random.choice([4, 5, 6])
reshape_img = img_diff.T
sum_color = list(map(lambda x: sum(x), reshape_img))
for i in range(1, len(sum_color)):
if sum_color[i] > 1000 and i > 60:
end_position = i
break
slide_distance = end_position - start_position
return slide_distance
    # Simulate a mouse track: slow acceleration at the start (2/10 of the distance),
    # fast in the middle (5/10), slow again (2/10), then a slow deceleration (1/10).
    # Returns x/y offsets plus per-step sleep times; pauses are inserted at the
    # start, the middle and the end.
def get_track(self, distance, track_list=None):
        '''Build the slide track.
        args:
            distance: total slide distance
        kwargs:
            track_list: the track, initialised empty
        return:
            the track plus two break-point indices
        '''
if track_list is None:
track_list = []
base = distance / 10
x1 = round(base * 2)
x2 = round(base * 5)
x3 = x1
x4 = distance - x1 - x2 - x3
ynoise_num = random.randint(5, 10)
y1 = [random.randint(-2, 2) for _ in range(ynoise_num)]
yrdm = list(set(random.choice(range(distance)) for _ in range(ynoise_num)))
x = [1] * distance
y = [0] * distance
for i, j in enumerate(yrdm):
y[j] = y1[i]
t1 = sorted([random.randint(8, 13) / 1000 for _ in range(x1)], reverse=True)
t2 = sorted([random.randint(1, 8) / 1000 for _ in range(x2)], reverse=True)
t3 = sorted([random.randint(8, 13) / 1000 for _ in range(x3)], reverse=True)
t4 = sorted([random.randint(12, 20) / 1000 for _ in range(x4)])
t = t1 + t2 + t3 + t4
for i in (zip(x, y, t)):
track_list.append(i)
return (track_list, x1 + x2, x1 + x2 + x3)
    # For the click captcha, capture the instruction text and the clickable image, plus the popup element
def get_click_images(self):
        '''Capture the images for the click captcha.
        return:
            the image whose characters must be clicked,
            the instruction image (used to count clicks when debugging by hand),
            the captcha container element (used to anchor the mouse and compute
            relative coordinates)
        '''
click_img_element = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_widget")))
self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_item_img")))
time.sleep(random.randint(1, 5) / 10)
click_position = self.get_position(click_img_element)
all_screenshot = self.get_screenshot()
click_img = all_screenshot.crop(click_position)
click_img.save("click_img.png")
tip_img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_tip_img")))
tip_position = self.get_position(tip_img)
tip_img = all_screenshot.crop(tip_position)
tip_img.save("tip_img.png")
return (click_img, tip_img, click_img_element)
    # Count the characters to click: grayscale, inverse binarisation, transpose, sum each column along x, count the gaps between glyphs
def cal_char_num(self, char_img_path):
        '''Count the characters that must be clicked.
        args:
            char_img_path: path of the instruction image
        return:
            the number of clicks
        '''
flag = 0
origin_img = cv2.imread(char_img_path)
gray_img = cv2.cvtColor(origin_img, cv2.COLOR_BGR2GRAY)
ret, thresh1 = cv2.threshold(gray_img, 127, 255, cv2.THRESH_BINARY_INV)
transpos_img = np.array(thresh1).T
result = list(map(lambda x: sum(x), transpos_img))
for i in range(len(result) - 3):
if result[i] == 0 and result[i + 1] == 0 and result[i + 2] > 0:
flag += 1
return flag
    # Return the captcha characters' click coordinates, converted to integers
def char_absolute_coord(self, img, num, coord=None):
        '''Debug helper: click the captcha image by hand and return integer coordinates.
        args:
            img: the captcha image
            num: number of clicks
        kwargs:
            coord: character coordinates, initialised empty
        return:
            the character coordinates
        '''
if coord is None:
coord = []
img = Image.open(img)
plt.imshow(img)
points = plt.ginput(num)
plt.close()
for i in points:
x_co, y_co = i
coord.append((round(x_co), round(y_co)))
return coord
    # Return the offsets from the start point to each character to click, as [(xoffset, yoffset), ...]
def get_offset_coord(self, absolute_coord, click_track=None):
        '''Relative offsets between consecutive click targets, for mouse moves.
        args:
            absolute_coord: absolute coordinates of the captcha characters
        kwargs:
            click_track: relative offsets between consecutive targets
        return:
            the relative offsets
        '''
if click_track is None:
click_track = []
for i, j in enumerate(absolute_coord):
if i == 0:
click_track.append(j)
else:
click_track.append((j[0] - absolute_coord[i - 1][0], j[1] - absolute_coord[i - 1][1]))
return click_track
    # Click-captcha validation: count the targets, then click each character in
    # order using the computed relative offsets.
    # The captcha image is sent to the Chaojiying platform, which returns the
    # coordinates to click.
def click_captcha_validate(self):
        '''Click according to the coordinates returned by the solving platform.
        return:
            used only to exit the method
        '''
click_img, tip_img, click_img_element = self.get_click_images()
bytes_array = BytesIO()
click_img.save(bytes_array, format="PNG")
coord_result = self.gt_shot.PostPic(bytes_array.getvalue(), "9005")
print(coord_result)
        pic_str = coord_result.get("pic_str")
        if not pic_str:
            raise RuntimeError("captcha solving timed out")
        groups = pic_str.split('|')
pic_id = coord_result.get("pic_id")
points = [[int(num) for num in group.split(',')] for group in groups]
# tip_img_path="D:\\Anaconda3\\Lib\\captcha\\gt_validate\\tip_img.png"
# click_img_path="D:\\Anaconda3\\Lib\\captcha\\gt_validate\\click_img.png"
# num=self.cal_char_num(tip_img_path)
# points=self.char_absolute_coord(click_img_path,num)
mouse_track = self.get_offset_coord(points)
print(mouse_track)
self.action.move_to_element_with_offset(click_img_element, 0, 0)
for position in mouse_track:
self.action.move_by_offset(position[0], position[1])
self.action.click()
self.action.pause(random.randint(3, 7) / 10)
self.action.perform()
time.sleep(random.randint(4, 6) / 10)
click_submit_btn = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_commit_tip')))
click_submit_btn.click()
self.action.reset_actions()
self.valide_process(pic_id=pic_id)
return
    # Slider validation: compute the distance and track, pausing randomly at the start, in the middle and at the end
def slide_captcha_validate(self):
        '''Slider captcha validation.
        return:
            used only to exit the method
        '''
self.get_slide_images()
distance = self.get_slide_distance()
track, p1, p2 = self.get_track(distance)
time.sleep(random.randint(3, 7) / 10)
for i, j in enumerate(track):
if i == p1 or i == p2:
time.sleep(random.randint(3, 7) / 10)
self.action.move_by_offset(j[0], j[1])
time.sleep(j[2])
time.sleep(random.randint(3, 7) / 10)
self.action.release()
self.valide_process()
return
    # Check whether the crack succeeded, with a retry mechanism.
    # After too many failed attempts the panel shows "点击此处重试" (click here
    # to retry), which must be clicked.
def valide_process(self, pic_id=None):
        '''Validation flow:
        1> If the GeeTest popup has gone and the result panel is visible, the
           validation succeeded; stop.
        2> If step 1 fails, it timed out.
        3> A timeout means either the GeeTest popup is still there (go to step 4)
           or the result panel never appeared (go to step 6).
        4> Popup still there: if the maximum number of attempts was exceeded,
           click retry and go to step 7; otherwise go to step 5.
        5> Determine the captcha type, call the matching validation method, and
           go back to step 1.
        6> If the result panel never appeared, quit the browser.
        7> After clicking retry: quit the browser on a blank response, otherwise
           determine the captcha type, call the matching method, and go back to
           step 1.
        args:
            pic_id: image id of a click captcha
        return:
            either the validation succeeds or the browser quits
        '''
try:
WebDriverWait(self.driver, 3).until_not(
EC.visibility_of_element_located((By.CSS_SELECTOR, "body > div.geetest_panel")))
WebDriverWait(self.driver, 10).until(EC.visibility_of_element_located((By.ID, "advs")))
print("Validate Successful")
return
except TimeoutException:
try:
gt_panel_error = self.driver.find_element_by_css_selector(
"body > div.geetest_panel.geetest_wind > div.geetest_panel_box > div.geetest_panel_error")
error_display = gt_panel_error.value_of_css_property("display")
if error_display.strip() == "block":
gt_panel_error_content = self.driver.find_element_by_css_selector(
".geetest_panel_error > div.geetest_panel_error_content")
self.action.move_to_element(gt_panel_error_content).click().perform()
self.action.reset_actions()
try:
WebDriverWait(self.driver, 3).until_not(
EC.visibility_of_element_located((By.CSS_SELECTOR, "body > div.geetest_panel")))
WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_id('advs').is_displayed())
print("Validate Successful")
return
except TimeoutException:
self.slide_orclick_validate(pic_id)
else:
self.slide_orclick_validate(pic_id)
            except Exception:
                print('error occurred')
return
    # Decide whether to run click validation or slider validation
def slide_orclick_validate(self, pic_id=None):
        '''Decide whether the next step is slider validation, click validation,
        or quitting the browser.
        args:
            pic_id: image id of a click captcha
        return:
            slider validation, click validation, or None
        '''
try:
WebDriverWait(self.driver, 3).until(EC.presence_of_element_located((By.CLASS_NAME, "geetest_close")))
print('Validate Failed,retry again')
if self.is_element_exist("geetest_canvas_img"):
print('captcha type is slide')
return self.slide_captcha_validate()
else:
print('captcha type is click')
if self.click_valitimes > 0:
self.gt_shot.ReportError(pic_id)
self.click_valitimes += 1
return self.click_captcha_validate()
        except Exception:
            print("No click or slider captcha present; nothing to validate")
return
    # Switch back to the home page (cookies kept) to run another search
def switch_hmpg(self):
        '''Switch from the result page back to the home page.
        return: used only to exit the method
        '''
self.wait.until(EC.presence_of_element_located((By.ID, "advs")))
hmpg_btn = self.driver.find_element_by_css_selector(
"body > div.container > div.header_box > div > div > a:nth-child(1)")
self.action.move_to_element(hmpg_btn).click().perform()
self.action.reset_actions()
self.wait.until(lambda x: x.find_element_by_id('btn_query').is_displayed())
return
    # Crawl steps, whether entering via the index page or returning to the home page for another search
def main(self, keyword, start_pg=None):
        '''Main driver routine.
        args:
            keyword: the search keyword
        kwargs:
            start_pg: whether the JiaSuLe bootstrap visit is needed (default: yes)
        '''
if start_pg == "homepage":
self.switch_hmpg()
else:
self.init()
self.input_query(keyword)
self.slide_orclick_validate()
    # Save the rendered page (the cookies stay in the driver) for requests-based detail parsing
def to_dict(self):
        '''Return the rendered result page (for requests-based fetching and
        detail parsing); the session cookies remain in the driver.
        '''
htmlpage = self.driver.page_source
return {
'page': htmlpage
}
if __name__ == '__main__':
init_url = "http://www.gsxt.gov.cn/SearchItemCaptcha"
index_url = "http://www.gsxt.gov.cn/index.html"
base_url = 'http://www.gsxt.gov.cn'
result_parse_rule = {'search_result_url': '//*[@id="advs"]/div/div[2]/a/@href'}
    detail_parse_rule = {
        # XPath positions are 1-based, so dl[1]..dl[15]
        'primaryinfo': ['string(//*[@id="primaryInfo"]/div/div[@class="overview"]/dl[{}])'.format(i) for i in
                        range(1, 16)], }
max_click = 10
chm_headers = ['Host="www.gsxt.gov.cn"',
'Connection="keep-alive"',
'User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"',
'Upgrade-Insecure-Requests=1',
'Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"',
'Accept-Encoding="gzip, deflate"',
'Accept-Language="zh-CN,zh;q=0.9"']
search = CorpSearch(init_url, index_url, chm_headers, max_click)
search.main("腾讯")
cookie_html = search.to_dict()
search_result = SearchResultParse(cookie_html['page'], base_url, result_parse_rule)
url_list = search_result.search_result_parse()
detail_request = CookieRequest(url_list=url_list)
detail_result = detail_request.cookie_requests()
for pg in detail_result:
pg_detail = PageDetailParse(pg, detail_parse_rule)
detail = pg_detail.search_result_parse()
m = re.findall(r'\[(.*?)\]', str(detail))
info_list = m[0].replace('\'', '').split(', ')
sql = "insert into company(code,name,type,start,end,) values(%s,%s,%s,%s.%s)"
count, rt_list = MysqlConnection.execute_sql(sql, (info_list[0],info_list[1],info_list[2],info_list[3]))
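MysqlConnection is another project-local helper the post does not show. A minimal sketch of the interface the call above assumes, built on pymysql (the connection settings are placeholders):

import pymysql


class MysqlConnection(object):
    @staticmethod
    def execute_sql(sql, params):
        # Assumed helper: run one parameterised statement and return
        # (affected row count, fetched rows).
        conn = pymysql.connect(host='localhost', user='root',
                               password='***', db='spider', charset='utf8mb4')
        try:
            with conn.cursor() as cursor:
                count = cursor.execute(sql, params)
                rows = cursor.fetchall()
            conn.commit()
            return count, rows
        finally:
            conn.close()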
class EnterPriseSpider(scrapy.Spider):
name = 'enterprise'
allowed_domains = ['gsxt.gov.cn']
start_urls = ['http://www.gsxt.gov.cn/index.html']
def __init__(self, word=None, *args, **kwargs):
        super(EnterPriseSpider, self).__init__(*args, **kwargs)
self.word = word
def start_requests(self):
init_url = "http://www.gsxt.gov.cn/SearchItemCaptcha"
index_url = "http://www.gsxt.gov.cn/index.html"
base_url = 'http://www.gsxt.gov.cn'
result_parse_rule = {'search_result_url': '//*[@id="advs"]/div/div[2]/a/@href'}
max_click = 10
chm_headers = ['Host="www.gsxt.gov.cn"',
'Connection="keep-alive"',
'User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"',
'Upgrade-Insecure-Requests=1',
'Accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"',
'Accept-Encoding="gzip, deflate"',
'Accept-Language="zh-CN,zh;q=0.9"']
search = CorpSearch(init_url, index_url, chm_headers, max_click)
search.main(self.word)
cookie_html = search.to_dict()
search_result = SearchResultParse(cookie_html['page'], base_url, result_parse_rule)
url_list = search_result.search_result_parse()
yield Request(url="https://www.baidu.com/",callback=self.parse,
meta={'url_list': url_list})
def parse(self, response):
        detail_parse_rule = {
            # XPath positions are 1-based, so dl[1]..dl[15]
            'primaryinfo': ['string(//*[@id="primaryInfo"]/div/div[@class="overview"]/dl[{}])'.format(i) for i in
                            range(1, 16)], }
url_list = response.meta.get("url_list", "")
detail_request = CookieRequest(url_list=url_list)
detail_result = detail_request.cookie_requests()
for pg in detail_result:
pg_detail = PageDetailParse(pg, detail_parse_rule)
detail = pg_detail.search_result_parse()
m = re.findall(r'\[(.*?)\]', str(detail))
info_list = m[0].replace('\'', '').split(', ')
item = CompanyItem()
item['name'] = company_info(info_list, "企业名称:")
item['code'] = company_info(info_list, "统一社会信用代码:")
item['type'] = company_info(info_list, "类型:")
start = company_info(info_list, "营业期限自:")
partner_start = company_info(info_list, "合伙期限自:")
item['start'] = start if "无" == partner_start else partner_start
end = company_info(info_list, "合伙期限自:")
partner_end = company_info(info_list, "合伙期限至:")
item['end'] = end if "无" == partner_end else partner_end
item['capital'] = company_info(info_list, "注册资本:")
item['owner'] = company_info(info_list, "法定代表人:")
item['establish'] = company_info(info_list, "成立日期:")
item['registration'] = company_info(info_list, "登记机关:")
item['check'] = company_info(info_list, "核准日期:")
item['status'] = company_info(info_list, "登记状态:")
residence = company_info(info_list, "住所:")
premises = company_info(info_list, "主要经营场所:")
item['address'] = residence if "无" == premises else premises
item['scope'] = company_info(info_list, "经营范围:")
item['partner'] = company_info(info_list, "执行事务合伙人:")
yield item
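company_info, used throughout parse above, is also a project helper that is not shown. A plausible minimal sketch (the real implementation may differ): it scans the flattened field strings for a label such as "企业名称:" and returns what follows it, falling back to "无", which is exactly the sentinel the comparisons above test against.

def company_info(info_list, label):
    # Assumed helper: value following `label` in the flattened detail strings,
    # or '无' when the field is missing.
    for field in info_list:
        if label in field:
            return field.split(label, 1)[1]
    return '无'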
from scrapy.cmdline import execute
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "enterprise","-a","word=百度"])
execute(["scrapy", "crawl", "zhilian"])