Boosting CSDN Article View Counts with Proxy IPs in Python


To keep our own machine from being blocked for hitting the page too frequently, we scrape proxy IPs and use them to visit the target page.
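For reference, this is roughly how the requests library routes a single request through a proxy. It is a minimal sketch, not part of the original script; the address 1.2.3.4:8080 is only a placeholder, not a real proxy.

import requests

# Placeholder proxy address; in practice this would be one of the scraped IPs
proxy = {'https': 'https://1.2.3.4:8080'}
header = {'User-Agent': 'Mozilla/5.0'}

# requests tunnels the HTTPS request through the proxy;
# a short timeout keeps a dead proxy from hanging the script
r = requests.get('https://blog.csdn.net/', headers=header, proxies=proxy, timeout=3)
print(r.status_code)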

import requests
from bs4 import BeautifulSoup
import random
import time
# Define the Proxyhandler class
class Proxyhandler(object):
    def __init__(self):
        # List of User-Agent strings used to masquerade as a browser
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
        self.proxy_url = 'https://www.xicidaili.com/wn/'
        # Scraped proxy IPs will be stored in this list
        self.proxy_list = []
        # Target article URL
        self.target_url = 'https://blog.csdn.net/weixin_39549161/article/details/86751162'
        self.time_out = 3
    # Scrape proxy IPs into proxy_list and drop invalid ones; scrape 10 pages by default
    def get_proxy_list(self,page_num = 10):
        # Pick a random User-Agent from user_agent_list
        ua = random.choice(self.user_agent_list)
        header = {'User-Agent': ua}
        print('Randomly chosen User-Agent: %s' % ua)
        try:
            for i in range(page_num):
                url = self.proxy_url + str(i + 1)
                r = requests.get(url, headers=header, timeout=self.time_out)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                html = r.text
                soup = BeautifulSoup(html,'html.parser')
                tr = soup.find_all('tr')
                # Skip the header row, then pull the IP and port from each table row
                for row in tr[1:]:
                    td = row.find_all('td')
                    self.proxy_list.append(td[1].text + ':' + td[2].text)
            # Test each scraped proxy against the target URL; iterate over a copy
            # so invalid entries can be removed from the original list safely
            for ip in self.proxy_list[:]:
                try:
                    proxy_host = 'https://' + ip
                    proxy = {'https': proxy_host}
                    requests.get(self.target_url, headers=header,
                                 proxies=proxy, timeout=self.time_out)
                # Drop proxies that raise an exception (unreachable or invalid)
                except Exception:
                    self.proxy_list.remove(ip)
                    continue
            # Return the list of working proxies
            return self.proxy_list
        except Exception as e:
            print('get_proxy_list error:', e)
    # Visit the target URL through a randomly chosen proxy
    def get_target_html(self, ip_list):
        try:
            ua = random.choice(self.user_agent_list)
            header = {'User-Agent': ua}
            proxy_host = 'https://' + random.choice(ip_list)
            proxy = {'https': proxy_host}
            print(proxy)
            r = requests.get(self.target_url, headers=header, proxies=proxy, timeout=30)
            r.encoding = r.apparent_encoding
            r.raise_for_status()
            soup = BeautifulSoup(r.text, 'html.parser')
            title = soup.find('title').string
            print(title)
        except Exception as e:
            print('get_target_html error:', e)
if __name__ == '__main__':
    # Instantiate the Proxyhandler class and fetch the list of working proxies
    proxy_handler = Proxyhandler()
    ip_list = proxy_handler.get_proxy_list()
    print(ip_list)
    # Visit the target article 10 times through random proxies, pausing between requests
    for i in range(10):
        proxy_handler.get_target_html(ip_list)
        time.sleep(31)
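Before running the full script, it can help to confirm that a scraped proxy actually hides your own IP. Below is a minimal sketch of such a check; the helper check_proxy and the use of httpbin.org/ip (a public service that echoes the caller's IP) are not part of the original article, just one way to verify a proxy.

import requests

def check_proxy(ip_port, timeout=3):
    # Return the IP the remote server sees when routing through ip_port, or None on failure
    proxy = {'https': 'https://' + ip_port}
    try:
        r = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=timeout)
        r.raise_for_status()
        return r.json().get('origin')
    except Exception:
        return None

print(check_proxy('1.2.3.4:8080'))  # placeholder proxy address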
