我使用 Selenium 简单地爬取搜索结果的 URL,这对自动化漏洞测试应该有用。我本想使用谷歌搜索,奈何没钱买代理;Google 搜索语法感觉比百度语法有用多了。
代码
# -*- coding: utf-8 -*-
"""
Created on Sat May 2 15:17:58 2020
@author: 14504
"""
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq
import requests
import time
# --- Configuration -------------------------------------------------------
url_save_path="./url.txt"  # file that scraped URLs are appended to
SearchInformation="inurl: (admin)"  # Baidu search query (search-dork syntax)
starPage=1 # first results page to crawl (1-based)
endPage=1  # last results page to crawl (inclusive)
# Run Chrome without a visible window (headless mode)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)  # NOTE: launches Chrome at import time
#browser = webdriver.Chrome()
wait= WebDriverWait(browser,10)  # explicit-wait helper, 10 s timeout
# Desktop browser User-Agent for the redirect-resolving requests below
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}
def searchURL(page):
    """Load one Baidu results page for SearchInformation and scrape it.

    page: zero-based result offset (Baidu's ``&pn=`` parameter; a multiple
          of 10, 10 results per page).
    Returns the number of URLs successfully extracted, or 0 on timeout.
    """
    page_param = "&pn=" + str(page)
    url = "https://www.baidu.com/s?wd=" + quote(SearchInformation) + page_param
    try:
        browser.get(url)
        return geturl()
    except TimeoutException:
        print("请求超时")
        # Bug fix: previously fell through returning None, which made the
        # caller's "urlsum = urlnum + urlsum" raise TypeError.
        return 0
def geturl():
    """Parse the Baidu results page currently loaded in the browser.

    For every result entry, resolve the Baidu tracking link to its real
    target and append it to the output file.
    Returns the count of URLs saved from this page.
    """
    saved = 0
    doc = pq(browser.page_source)
    for result in doc('div#content_left .result.c-container').items():
        baidu_link = result.children('div.f13 > a').attr('href')
        target = urlDecode(baidu_link)
        if target == "":
            print("none")
            continue
        saveTotxt(target)
        saved += 1
    print("这一页成功爬取了" + str(saved) + "个\n")
    return saved
# Resolve a Baidu redirect/tracking link to the real destination URL
def urlDecode(BDurl):
    """Request *BDurl* without following redirects and return the target.

    BDurl: the href taken from a Baidu result entry (may be None or invalid).
    Returns the ``Location`` response header, or "" on any failure.
    """
    try:
        # timeout added so one hung server cannot stall the whole crawl
        res = requests.get(BDurl, allow_redirects=False, timeout=10)
        # Bug fix: a missing Location header used to raise KeyError and be
        # silently swallowed by a bare except; use .get with a default.
        return res.headers.get('Location', "")
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError', e.args)
        return ""
    except requests.exceptions.MissingSchema as e:
        print('Schema is none', e.args)
        return ""
    except requests.exceptions.RequestException:
        # any other requests failure (timeout, invalid URL, ...) -> no URL
        return ""
def saveTotxt(real_url):
    """Append *real_url* (newline-terminated) to the file at url_save_path."""
    with open(url_save_path, 'a', encoding='utf-8') as out:
        out.write(real_url + "\n")
def main():
    """Crawl Baidu result pages starPage..endPage and report the total count."""
    urlsum = 0
    for page_index in range(starPage - 1, endPage):
        print("正在爬取第" + str(page_index + 1) + "页")
        # Baidu's &pn= offset counts results, 10 per page
        urlnum = searchURL(page_index * 10)
        # Defensive guard: searchURL may return None after a timeout,
        # which would otherwise break the running sum with a TypeError.
        if urlnum:
            urlsum += urlnum
        time.sleep(1)  # be polite: pause between page requests
    print("成功爬取" + str(urlsum) + "个url地址")


if __name__ == "__main__":
    main()