import random, requests, logging
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
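# Scrape a list of free HTTP proxies, then use a thread pool to visit every
# article linked from a given blog index page `count` times through random proxies.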
logging.basicConfig(level=logging.DEBUG)
TIME_OUT = 15  # per-request timeout in seconds
proxies = []
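# Filled by GetProxies(); each entry is a requests-style proxies mapping.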
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,pl;q=0.7,en;q=0.6',
    'Connection': 'keep-alive'
}
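# Page listing free HTTP proxies; GetProxies() parses its IP/port table.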
PROXY_URL = 'http://www.66ip.cn/areaindex_1/1.html'
def GetProxies():
    """Scrape IP/port pairs from PROXY_URL into the global proxies list."""
    global proxies
    try:
        text = requests.get(PROXY_URL, headers=header, timeout=TIME_OUT).text
    except requests.RequestException:
        logging.error('proxy failed!')
        return
    html = etree.HTML(text)
    ips = html.xpath("//div[@id='footer']/div/table/tr/td[1]/text()")
    ports = html.xpath("//div[@id='footer']/div/table/tr/td[2]/text()")
    # Drop the first and last table cells (header/footer rows).
    ips = ips[1:-1]
    ports = ports[1:-1]
    for ip, port in zip(ips, ports):
        # requests expects a scheme on the proxy URL.
        proxies.append({'http': 'http://{}:{}'.format(ip, port)})
def GetArticles(url):
    """Return the article URLs found on the blog index page."""
    res = GetRequest(url, prox=None)
    html = etree.HTML(res.text)
    urls_list = html.xpath("//*[@id='mainBox']/main/div/div/h4/a/@href")
    return urls_list
def GetRequest(url, prox=None):
    """Fetch url, optionally through the given proxy mapping."""
    return requests.get(url, headers=header, proxies=prox, timeout=TIME_OUT)
def VisitWithProxy(url):
    proxy = random.choice(proxies)
    try:
        GetRequest(url, proxy)
    except requests.RequestException:
        # A dead proxy should not kill the whole worker thread.
        logging.warning('Request via {} failed'.format(proxy))
def VisitLoop(url):
    # `count` is set in the __main__ block below.
    for i in range(count):
        logging.debug('Visiting:\t{}\t({}/{})'.format(url, i + 1, count))
        VisitWithProxy(url)
if __name__ == '__main__':
    GetProxies()
    logging.debug('We got {} proxies'.format(len(proxies)))
    if not proxies:
        logging.error('No proxies available, aborting')
        quit()
    BlogUrl = input('Blog Address: ').strip()
    logging.debug('Going to visit {}'.format(BlogUrl))
    try:
        count = int(input('Visit Count: '))
    except ValueError:
        logging.error('Visit Count must be an integer')
        quit()
    if count <= 0 or count > 200:
        logging.error('Count illegal')
        quit()
    article_list = GetArticles(BlogUrl)
    if len(article_list) == 0:
        logging.error('No articles found!')
        quit()
    # One worker per four articles, but always at least one thread.
    pool = ThreadPool(max(1, len(article_list) // 4))
    pool.map(VisitLoop, article_list)
    pool.close()
    pool.join()
    logging.debug('Done!')