使用selenium爬取百度搜索的URL

我使用 selenium 简单地爬取百度搜索结果中的 URL,这对自动化漏洞测试应该会有帮助。我本来想用谷歌搜索,奈何没钱买代理;感觉 Google 的搜索语法比百度的有用多了。

代码

# -*- coding: utf-8 -*-
"""
Created on Sat May  2 15:17:58 2020

@author: 14504
"""


from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq 
import requests
import time

# --- Crawl configuration -------------------------------------------------
# Text file that every resolved URL is appended to (one URL per line).
url_save_path="./url.txt"
# Search query; it is URL-quoted and placed in the ``wd`` parameter by searchURL.
SearchInformation="inurl: (admin)"
# First and last result page to crawl. main() iterates
# range(starPage-1, endPage), so both bounds are 1-based and inclusive.
starPage=1   # page number
endPage=1

# Run Chrome without a visible window (headless mode).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

# Alternative kept by the author: start Chrome with a visible window.
#browser = webdriver.Chrome()
# Explicit-wait helper bound to the browser with a 10 second limit.
# NOTE(review): not referenced by any function visible in this file.
wait= WebDriverWait(browser,10)
# Firefox-like User-Agent header.
# NOTE(review): not passed to any request in the visible code — confirm
# whether urlDecode was meant to send it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    }



def searchURL(page):
    """Open one Baidu result page in the browser and collect its URLs.

    Args:
        page: Value placed in the ``pn`` query parameter (``main`` passes
            the zero-based page index multiplied by 10).

    Returns:
        int: The count reported by ``geturl`` for this page, or 0 when the
        browser raises ``TimeoutException``.
    """
    page_param = "&pn=" + str(page)
    url = "https://www.baidu.com/s?wd=" + quote(SearchInformation) + page_param
    try:
        browser.get(url)
        return geturl()
    except TimeoutException:
        print("请求超时")
        # Return an explicit zero instead of falling through with None, so
        # callers that sum the per-page counts do not hit a TypeError.
        return 0

def geturl():
    """Extract and save the result URLs from the page currently loaded.

    Parses ``browser.page_source`` with pyquery, takes the ``href`` of the
    ``div.f13 > a`` link inside each result container, resolves it through
    ``urlDecode`` and appends every non-empty result via ``saveTotxt``.

    Returns:
        int: Number of URLs saved from this page.
    """
    document = pq(browser.page_source)
    saved = 0
    for result in document('div#content_left  .result.c-container').items():
        # Presumably a Baidu redirect link rather than the final address,
        # which is why it is passed through urlDecode first.
        redirect_link = result.children('div.f13 > a').attr('href')
        target = urlDecode(redirect_link)
        if target != "":
            saveTotxt(target)
            saved += 1
            continue
        # urlDecode signals failure with an empty string.
        print("none")
    print("这一页成功爬取了"+str(saved)+"个\n")
    return saved
    
def urlDecode(BDurl):
    """Resolve a Baidu redirect link to the URL it points at.

    Issues a GET without following redirects and reads the ``Location``
    response header.

    Args:
        BDurl: The link taken from a search result (may be ``None`` when the
            result had no matching anchor).

    Returns:
        str: The ``Location`` header value, or ``""`` when the request fails
        or the response carries no ``Location`` header.
    """
    try:
        # A timeout keeps one unresponsive redirect from stalling the crawl.
        res = requests.get(BDurl, allow_redirects=False, timeout=10)
        return res.headers['Location']
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError', e.args)
        return ""
    except requests.exceptions.MissingSchema as e:
        print('Schema is none', e.args)
        return ""
    except Exception:
        # Best-effort: any other failure (timeout, missing Location header,
        # invalid URL, ...) yields an empty result. Catching Exception rather
        # than using a bare ``except`` lets KeyboardInterrupt and SystemExit
        # propagate, so the crawler can still be stopped with Ctrl-C.
        return ""
        

def saveTotxt(real_url, path=None):
    """Append one URL as a single line to a UTF-8 text file.

    Args:
        real_url: URL string to record.
        path: Destination file. Defaults to the module-level
            ``url_save_path`` when omitted, which keeps existing
            single-argument callers working unchanged.
    """
    target = url_save_path if path is None else path
    with open(target, 'a', encoding='utf-8') as file:
        file.write(real_url + "\n")

def main():
    """Crawl result pages ``starPage`` through ``endPage`` and print totals.

    Calls ``searchURL`` once per page, pausing one second after each page,
    then prints the total number of URLs collected.
    """
    urlsum = 0
    for page_index in range(starPage - 1, endPage):
        print("正在爬取第"+str(page_index+1)+"页")
        # searchURL receives page_index * 10 for the ``pn`` parameter —
        # presumably a result offset with 10 results per page.
        urlnum = searchURL(page_index * 10)
        # Treat a missing (None) result as zero so one failed page does not
        # abort the whole run with a TypeError.
        urlsum += urlnum or 0
        time.sleep(1)

    print("成功爬取"+str(urlsum)+"个url地址")
        

# Run the crawl, then always shut the WebDriver down so the headless Chrome
# process is not left running after the script ends or fails.
try:
    main()
finally:
    browser.quit()

 

你可能感兴趣的:(Python)