# Python crawler - use proxies to increase blog article page views

import re , random , requests , logging
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool

# Emit DEBUG-level records so every visit is traced by the root logger.
logging.basicConfig(level=logging.DEBUG)

# Per-request timeout in seconds, passed to requests.get().
TIME_OUT = 15

# Proxy pool filled by GetProxies(); each entry is a requests-style proxies
# mapping such as {'http': 'ip:port'}.
proxies = []

# Browser-like headers sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
    # 'br' is deliberately not advertised: requests can only decode Brotli
    # when an optional brotli package is installed. Without it the response
    # text is undecodable and the XPath queries silently match nothing.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,pl;q=0.7,en;q=0.6',
    'Connection': 'keep-alive',
}

# Proxy listing site scraped by GetProxies().
PROXY_URL = 'http://www.66ip.cn/areaindex_1/1.html'


# Scrape usable proxy IPs and ports from the proxy listing site
def GetProxies():
    """Download PROXY_URL and append the proxies listed there to ``proxies``.

    The first and second table columns under the ``footer`` div are read as
    IP and port. Each pair is stored as a requests-style mapping
    ``{'http': 'ip:port'}`` in the module-level ``proxies`` list.

    Returns:
        None. When the download fails or the page cannot be parsed, an error
        is logged and ``proxies`` is left unchanged.
    """
    try:
        # Bound the request so an unresponsive proxy site cannot hang startup.
        text = requests.get(PROXY_URL, headers=header, timeout=TIME_OUT).text
    except requests.RequestException:
        logging.error('proxy failed!')
        return

    html = etree.HTML(text)
    if html is None:
        # An empty document produces no tree; there is nothing to query.
        logging.error('proxy failed!')
        return

    # The first and last extracted cells are discarded, as before
    # (presumably a header row and a trailing row -- verify against the
    # live page layout).
    ips = html.xpath("//div[@id='footer']/div/table/tr/td[1]/text()")[1:-1]
    ports = html.xpath("//div[@id='footer']/div/table/tr/td[2]/text()")[1:-1]

    # zip() stops at the shorter column, so a row without a port can no
    # longer raise IndexError.
    # NOTE(review): only the 'http' scheme is mapped; requests presumably
    # bypasses these proxies for https:// URLs -- confirm this is intended.
    proxies.extend(
        dict(http='{}:{}'.format(ip, port)) for ip, port in zip(ips, ports)
    )

# Collect the article links from a blog index page
def GetArticles(url):
    """Return the article URLs found on the blog page at *url*.

    The page is fetched through GetRequest without a proxy and queried for
    the ``href`` of every ``h4/a`` element under the ``mainBox`` container.

    Args:
        url: address of the blog index page.

    Returns:
        A list of href strings; empty when nothing matches or when the
        response body cannot be parsed into a tree.

    Raises:
        Any exception raised by GetRequest propagates to the caller.
    """
    res = GetRequest(url, prox=None)
    html = etree.HTML(res.text)
    if html is None:
        # An empty body produces no tree; report "no articles" instead of
        # failing with AttributeError on ``None.xpath``.
        return []
    return html.xpath("//*[@id='mainBox']/main/div/div/h4/a/@href")

def GetRequest(url,prox):
    """Send a GET request for *url* and return the response object.

    Args:
        url: address to fetch.
        prox: proxies mapping handed to requests, or None.

    Returns:
        The response returned by ``requests.get``, issued with the shared
        ``header`` dict and the ``TIME_OUT`` timeout.
    """
    return requests.get(
        url,
        headers=header,
        proxies=prox,
        timeout=TIME_OUT,
    )

def VisitWithProxy(url):
    """Visit *url* once through a proxy picked at random from ``proxies``.

    A failed visit is logged and swallowed so that one dead proxy does not
    abort the remaining visits running in the thread pool.

    Args:
        url: address of the article to visit.

    Returns:
        None.
    """
    if not proxies:
        # random.choice() raises IndexError on an empty pool.
        logging.error('No proxy available, skipping {}'.format(url))
        return

    proxy = random.choice(proxies)
    try:
        GetRequest(url, proxy)
    except requests.RequestException as err:
        logging.warning('Visit via {} failed: {}'.format(proxy, err))

def VisitLoop(url):
    """Visit *url* ``count`` times, logging the ordinal of each visit.

    ``count`` is read from module scope; it is assigned by the script's
    entry point before the thread pool starts.
    """
    for attempt in range(1, count + 1):
        message = 'Visiting:\t{}\tfor {} times'.format(url, attempt)
        logging.debug(message)
        VisitWithProxy(url)

if __name__ == '__main__':
    # Script entry point: load proxies, read the blog address and the visit
    # count from stdin, then visit every article ``count`` times in parallel.
    # ``count`` is assigned at module scope here, which is where VisitLoop
    # looks it up; no ``global`` statement is needed.
    GetProxies()
    logging.debug('We got {} proxies'.format(len(proxies)))
    if not proxies:
        # Without proxies every visit would fail in random.choice().
        logging.error('No proxy , error !')
        raise SystemExit(1)

    BlogUrl = input('Blog Address').strip(' ')
    logging.debug('Gonna visite {}'.format(BlogUrl))
    try:
        count = int(input('Visit Count:'))
    except ValueError:
        logging.error('Arg error')
        raise SystemExit(1)
    # Reject zero and negative counts as well as anything above 200.
    if count <= 0 or count > 200:
        logging.error('Count illegal')
        raise SystemExit(1)

    article_list = GetArticles(BlogUrl)
    if not article_list:
        logging.error('No article , error !')
        raise SystemExit(1)

    # Multithreading: one worker per four articles, but never fewer than
    # one -- ThreadPool(0) raises ValueError for blogs with under 4 articles.
    pool = ThreadPool(max(1, len(article_list) // 4))
    try:
        pool.map(VisitLoop, article_list)
    finally:
        # Release the worker threads even if a visit raised.
        pool.close()
        pool.join()
    logging.debug('Done!')

# You may also be interested in: (crawler, python)