[Python]网络爬虫(六) 一个刷投票小脚本

# 描述
脚本一共有两版,分别使用了BeautifulSoup+selenium +chrome和BeautifulSoup+selenium +firefox。实现了对于某网站刷投票的功能。
1.该网站投票选项每次刷新页面后会重新随机排列,但选项的 id 不会改变,因此可以通过 browser.find_element_by_id(ID).click() 进行操作。
2.该网站选满有且只有10个选项才能投票,并且会弹窗提示。
3.该网站限制IP投票次数。
4.该网站限制每次选择选项的间隔时间。
5.网站是ajax异步更新。

# 准备工作

#### 下载 chrome 浏览器和 chromedriver
chromedriver下载地址:
http://chromedriver.storage.googleapis.com/index.html (有墙)
http://npm.taobao.org/mirrors/chromedriver/ (无墙)

#### 下载火狐浏览器和 geckodriver
geckodriver 下载地址:
https://github.com/mozilla/geckodriver/releases/

## BeautifulSoup + selenium + chrome

from bs4 import BeautifulSoup
from selenium import webdriver
import time
from random import sample,choice
import requests
import re

def scanWeb(addr, ips=None):
    """Cast votes on *addr* once per proxy in *ips*.

    For each "ip:port" string a fresh Chrome instance is started behind that
    proxy with a random user-agent, ten item IDs are clicked, the confirmation
    alert is accepted and the ballot is submitted.

    Args:
        addr: URL of the voting page.
        ips: list of "ip:port" proxy strings (default: no proxies, do nothing).
             NOTE: the original used a mutable default ``ips=[]``; ``None`` is
             the safe equivalent.
    """
    success = 0
    for num, ipport in enumerate(ips or []):
        result = str(num) + ". 代理:" + ipport

        # Per-iteration browser options: language, random UA, and the proxy.
        options = webdriver.ChromeOptions()
        options.add_argument('lang=zh_CN.UTF-8')
        options.add_argument('user-agent="' + selectUserAgent() + '"')
        options.add_argument('--proxy-server=http://' + ipport)

        browser = webdriver.Chrome(chrome_options=options)
        browser.set_page_load_timeout(10)   # fail get() after 10s
        browser.set_script_timeout(10)      # fail async scripts after 10s

        try:
            try:
                browser.get(addr)
            except Exception:
                # Page-load timeout: stop loading and proceed with whatever
                # rendered (the page is ajax-driven, partial DOM may suffice).
                print("加载页面太慢,停止加载,继续下一步操作")
                browser.execute_script("window.stop()")

            time.sleep(3)  # wait for the ajax content to settle

            # Vote for the 10 selected items; the site throttles click speed,
            # hence the 1s pause between clicks.
            for ID in selectItemID():
                browser.find_element_by_id(ID).click()
                time.sleep(1)

            # Dismiss the "10 items selected" popup, then submit.
            time.sleep(1)
            browser.switch_to.alert.accept()  # modern API (switch_to_alert is deprecated)
            browser.find_element_by_class_name("btn").click()
        except Exception:
            print(result + "失败")
            continue
        else:
            success += 1
            time.sleep(2)
            print("{0}成功,共成功{1}次".format(result, success))
        finally:
            # Always release the browser, even when an iteration fails —
            # the original leaked a Chrome process on unexpected errors.
            browser.quit()


def selectItemID():
    """Build the ballot: 4 mandatory item IDs followed by 6 random extras.

    Returns:
        list of 10 element IDs, the first four always the mandatory ones.
    """
    mandatory = ["v1275", "v1270", "v1300", "v1278"]
    optional = [
        'v1344', 'v1267', 'v1268', 'v1280', 'v1304', 'v1148', 'v1283',
        'v1276', 'v1274', 'v1288', 'v1222', 'v1286', 'v1277', 'v1303',
        'v1285', 'v1273', 'v1309', 'v1305', 'v1284', 'v1282',
    ]
    return mandatory + sample(optional, 6)

def selectUserAgent():
    """Return one user-agent string picked at random from a fixed pool."""
    pool = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0",
        "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11",
        "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
    )
    return choice(pool)

def get_ip():
    """Scrape the xicidaili free-proxy list and return "ip:port" strings.

    Returns:
        list of strings such as "115.112.88.23:8080".

    BUG FIXED: the original port pattern ``(\\d+)`` matched every digit run in
    the page — including the four octets of each IP address — so ``zip(ip,
    port)`` paired IPs with fragments of other IPs instead of real ports.
    Anchoring the pattern to a whole ``<td>`` cell matches only port cells.
    """
    url = "http://www.xicidaili.com/nn"
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip, deflate, sdch",
               "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
               "Referer": "http://www.xicidaili.com",
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = soup.table.find_all("td")
    text = str(data)
    ip_compile = re.compile(r'(\d+\.\d+\.\d+\.\d+)')  # dotted-quad IPs
    port_compile = re.compile(r'<td>(\d+)</td>')      # digits-only <td> cells = ports
    ips = ip_compile.findall(text)
    ports = port_compile.findall(text)
    return [":".join(pair) for pair in zip(ips, ports)]


# Entry point: only run the vote loop when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    scanWeb("http://xxxxxxxxxxxxxxxxxxxxxxxxxx/", get_ip())

## BeautifulSoup + selenium + firefox

from bs4 import BeautifulSoup
from selenium import webdriver
import time
from random import sample,choice
import requests
import re

def scanWeb(addr, ips=None):
    """Cast votes on *addr* once per proxy in *ips* using Firefox.

    Each iteration starts a Firefox instance configured with the proxy and a
    random user-agent, clicks the ten selected items via JavaScript, accepts
    the confirmation alert and submits the ballot.

    Args:
        addr: URL of the voting page.
        ips: list of "ip:port" proxy strings (default: none — avoids the
             original mutable-default ``ips=[]``).

    BUGS FIXED:
      * the timeout handler called ``browser.execute_script`` but this
        function's variable is ``driver`` — a guaranteed NameError;
      * ``network.proxy.http_port`` was set to a *string*; Firefox expects an
        integer preference, so the proxy was silently never applied.
    """
    for num, ipstring in enumerate(ips or []):
        ip, port = ipstring.split(":")

        profile = webdriver.FirefoxProfile()
        # Manual proxy configuration (type 1 = manual).
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.http", ip)
        profile.set_preference("network.proxy.http_port", int(port))  # must be int
        profile.set_preference("general.useragent.override", selectUserAgent())
        profile.update_preferences()
        driver = webdriver.Firefox(profile)

        driver.set_page_load_timeout(10)  # fail get() after 10s
        driver.set_script_timeout(10)     # fail async scripts after 10s
        try:
            try:
                driver.get(addr)
            except Exception:
                # Page-load timeout: stop loading and carry on with the
                # partially rendered ajax page.
                print("加载页面太慢,停止加载,继续下一步操作")
                driver.execute_script("window.stop()")  # was `browser` — NameError

            time.sleep(3)  # wait for the ajax content to settle

            # Click the 10 items through JS (bypasses element-visibility checks).
            for ID in selectItemID():
                js2 = "var q=document.getElementById('" + ID + "').click()"
                driver.execute_script(js2)

            time.sleep(3)

            # Dismiss the confirmation popup, then submit.
            driver.switch_to.alert.accept()  # modern API (switch_to_alert is deprecated)
            driver.find_element_by_class_name("btn").click()
        except Exception:
            # Best effort per proxy: a failed attempt just moves on.
            pass
        finally:
            # Always release the browser — the original leaked a Firefox
            # process whenever an unexpected exception escaped.
            driver.quit()
def selectUserAgent():
    """Randomly pick a user-agent string from a hard-coded pool."""
    candidates = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0",
        "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11",
        "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
    ]
    picked = choice(candidates)
    return picked

def selectItemID():
    """Compose the 10-item ballot: four fixed IDs plus six sampled extras."""
    fixed = ["v1275", "v1270", "v1300", "v1278"]
    extras = [
        'v1344', 'v1267', 'v1268', 'v1280', 'v1304', 'v1148', 'v1283',
        'v1276', 'v1274', 'v1288', 'v1222', 'v1286', 'v1277', 'v1303',
        'v1285', 'v1273', 'v1309', 'v1305', 'v1284', 'v1282',
    ]
    chosen = sample(extras, 6)
    return fixed + chosen


def get_ip():
    """Scrape the xicidaili free-proxy list and return "ip:port" strings.

    Returns:
        list of strings such as "115.112.88.23:8080".

    BUG FIXED: the original port pattern ``(\\d+)`` matched every digit run in
    the page — including the four octets of each IP address — so ``zip(ip,
    port)`` paired IPs with fragments of other IPs instead of real ports.
    Anchoring the pattern to a whole ``<td>`` cell matches only port cells.
    """
    url = "http://www.xicidaili.com/nn"
    headers = {"Accept": "text/html,application/xhtml+xml,application/xml;",
               "Accept-Encoding": "gzip, deflate, sdch",
               "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
               "Referer": "http://www.xicidaili.com",
               "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
               }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = soup.table.find_all("td")
    text = str(data)
    ip_compile = re.compile(r'(\d+\.\d+\.\d+\.\d+)')  # dotted-quad IPs
    port_compile = re.compile(r'<td>(\d+)</td>')      # digits-only <td> cells = ports
    ips = ip_compile.findall(text)
    ports = port_compile.findall(text)
    return [":".join(pair) for pair in zip(ips, ports)]


# Entry point: only run the vote loop when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    scanWeb("http://xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", get_ip())

欢迎关注我的公众号。
这里写图片描述

你可能感兴趣的:(Python)