Python 3.7 + Selenium + BeautifulSoup4 + Requests + Threading: scraping a site that loads content asynchronously

A script for scraping a manga site that uses JavaScript to deter crawlers. See the comments in the code for details.
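
Before the full script, here is a condensed sketch (not the author's code verbatim) of the core trick it relies on: drive a headless Chrome through Selenium, wait for a known element so you know the JS-rendered page is ready, then call the site's own paging JavaScript to advance to the next image. The URL and chromedriver path below are placeholders; the element id "k_next" and the paging function name are the ones used later in the script and are specific to the targeted site.

# Minimal sketch of the headless-Selenium pattern used by the script below
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

options = webdriver.ChromeOptions()
options.add_argument('--headless')       # no visible browser window
options.add_argument('--disable-gpu')
driver = webdriver.Chrome('chromedriver.exe', options=options)  # placeholder path (Selenium 3 style, as in the script)
driver.get('https://example.com/comic/chapter-1/')              # placeholder chapter URL
# Block until the JS-rendered page exposes the "next page" control
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "k_next")))
html = driver.page_source  # now contains the dynamically loaded content
# Run the site's own paging JS to flip to the next image
driver.execute_script("javascript:a_f_qTcms_Pic_nextUrl_Href();")
driver.close()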

# coding=utf-8
import pdfkit
import requests

from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import os, time, threading
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.common.by import By  # locator strategies (By.ID, ...)
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Parse the index page: collect each chapter's URL and title
def parse_url_to_html(url, name, istart, iend):
    heads = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    response = requests.get(url, headers=heads)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    mainpages = []   # chapter URLs
    maintitles = []  # corresponding chapter titles
    # The first element with class "cy_plist" holds the chapter list (links and titles)
    tag_main = soup.find_all(class_="cy_plist")[0]

    for i in tag_main.find_all("li"):
        if i is None:
            continue
        mainpages.append(i.a.get('href'))
        maintitles.append(i.a.get_text())
    # Reverse so the chapters are processed in ascending order
    mainpages.reverse()
    maintitles.reverse()

    print("write begin++++++++++++++++>>>>>>>>>>>>>....")
    print(mainpages)

    return mainpages, maintitles
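
As a quick illustration of what the parsing above returns, here is a hypothetical chapter list that mirrors the "cy_plist" structure the function expects; the HTML snippet and chapter names are invented for the example.

from bs4 import BeautifulSoup

sample = '''
<ul class="cy_plist">
  <li><a href="/comic/2/">Chapter 2</a></li>
  <li><a href="/comic/1/">Chapter 1</a></li>
</ul>
'''
soup = BeautifulSoup(sample, "html.parser")
plist = soup.find_all(class_="cy_plist")[0]
pages = [li.a.get('href') for li in plist.find_all("li")]
titles = [li.a.get_text() for li in plist.find_all("li")]
pages.reverse()    # mirror the reverse() calls in parse_url_to_html above
titles.reverse()
print(pages)   # ['/comic/1/', '/comic/2/']
print(titles)  # ['Chapter 1', 'Chapter 2']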

# Open one chapter with headless Chrome, step through its pages via the site's own
# paging JavaScript, download every image, and return an HTML fragment for the chapter.
def downloadImage(url, maintitles, chapter, istart, iend):

    heads = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    # HTML scaffolding that will wrap the downloaded images
    tag = '<br/><div class="pr tbCenter clearfix" id="tbCenter"> <br/> \n \
               <div id="images"> <br/> \n \
               </div><br/>' + '<div style="text-align:center"> \n '
    # Heading with the chapter title, placed before the images
    tag1 = '<h2>' + maintitles + '</h2> <br/> \n '

    options = webdriver.ChromeOptions()
    options.add_argument('user-agent="Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19"')
    # The next two lines run Chrome headless so no browser window appears
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Do not load images in the browser; they are downloaded with requests instead
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome('C:\\Users\\test\\Desktop\\chromedriver_win32\\chromedriver.exe', options=options)
    driver.get(url)
    # Wait (up to 1000 s) until the element with id "k_next" appears, i.e. the JS-rendered page is ready
    element = WebDriverWait(driver, 1000).until(EC.presence_of_element_located((By.ID, "k_next")))
    soup = BeautifulSoup(driver.page_source, "lxml")
    # The "BarTit" link shows "current/total"; take the total number of pages in this chapter
    tag_tmp = soup.find_all('a', attrs={'href': 'javascript:;', 'class': 'BarTit'})
    npicture = int(tag_tmp[0].getText().split('/')[1][:-1])
    suffix = '_' + str(istart) + '-' + str(iend)
    for ipic in range(npicture):
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # BeautifulSoup attribute lookup instead of regular expressions
        tag_tmp = soup.find_all('div', attrs={'class': 'mh_box'})
        filepath = tag_tmp[0].img['src']
        filename = str(chapter) + '-' + str(ipic + 1) + '.jpg'
        response = requests.get(filepath, headers=heads)
        # Skip images that were already downloaded
        if os.path.exists('imagehuyao/' + filename):
            pass
        else:
            response = requests.get(filepath, headers=heads)
            urlretrieve(response.url, 'imagehuyao/' + filename)
        # Append an <img> tag for this page to the chapter fragment
        tag = tag + '<img src="' + filename + '" height="1132" width="800"/> <br/> \n'
        # Use Selenium to run the site's own JS that flips to the next page
        driver.execute_script("javascript:a_f_qTcms_Pic_nextUrl_Href();")
    driver.close()
    tag = tag1 + tag + '</div>' + '<br/>'
    return tag

# Fetch each chapter page in the given range and assemble its HTML fragment
def get_htmls(url, mainpages, maintitles, name, istart, iend):
    if not url.endswith('/'):
        url = url + '/'
    url = url.split("index.html/")[0]
    suffix = '_' + str(istart) + '-' + str(iend)
    for i in range(istart, iend, 1):
        # The chapter links are relative, so they must be resolved against the base URL
        urll = urljoin(url, mainpages[i])
        print(urll)
        if urll is None:
            continue
        tag = downloadImage(urll, maintitles[i], i, istart, iend)
        htmls = tag
        # Writing the assembled HTML is currently disabled:
        #with open(name + suffix + ".html", 'a', encoding='utf-8') as f:
        #    f.write(htmls)
        print(" (%s) [%s] download end" % (i, mainpages[i]))

def save_pdf(html, name):
    """Convert the assembled HTML file into a PDF."""
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
        'footer-font-name': 'Times New Roman',
        'header-font-name': 'Times New Roman',
        'minimum-font-size': 24,
    }
    pdfkit.from_file(html, name + ".pdf", options=options)

# Parse a catalogue page: collect the URL and title of every comic listed on it
def get_url(url_mainpage):
    response = requests.get(url_mainpage)
    print(response.status_code)
    soup = BeautifulSoup(response.content, "html.parser")
    s = []      # comic URLs
    title = []  # corresponding titles
    tag = soup.find_all(class_="thumbnail")  # each "thumbnail" block holds a link and a title
    for i in tag:
        tmp = i.a.get('href')
        if tmp.startswith("http"):
            s.append(i.a.get('href'))
            title.append(i.img.get('alt'))
    return s, title

# Scrape chapters istart..iend-1 of one comic
def runfile(url, name, istart, iend):
    suffix = '_' + str(istart) + '-' + str(iend)
    if os.path.exists(name + suffix + '.html'):
        os.remove(name + suffix + '.html')
    if os.path.exists(name + suffix + '.pdf'):
        os.remove(name + suffix + '.pdf')
    mainpages, maintitles = parse_url_to_html(url, name, istart, iend)
    get_htmls(url, mainpages, maintitles, name, istart, iend)

if __name__ == '__main__':
    url = 'a manga site'   # placeholder: URL of the comic's index page
    name = '灵剑山'
    threads = []
    start = 400
    end = 450
    step = 1
    # One thread per chapter range [i, i+step)
    for i in range(start, end, step):
        istart = i
        iend = i + step
        t = threading.Thread(target=runfile, args=(url, name, istart, iend))
        threads.append(t)
    for i in range(len(range(start, end, step))):
        threads[i].setDaemon(True)
        threads[i].start()
    for i in range(len(range(start, end, step))):
        threads[i].join()
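
Two small wiring notes. save_pdf() is defined but never called; if the per-range HTML writing (commented out in get_htmls) were re-enabled, one plausible way to hook it in, assuming wkhtmltopdf is installed for pdfkit, is a wrapper like the hypothetical one below. Also, Thread.setDaemon() still works on Python 3.7, but assigning t.daemon = True is the preferred spelling in newer versions.

# Hypothetical wrapper: only meaningful if get_htmls actually writes name+suffix+'.html'
def runfile_with_pdf(url, name, istart, iend):
    suffix = '_' + str(istart) + '-' + str(iend)
    runfile(url, name, istart, iend)
    if os.path.exists(name + suffix + '.html'):
        save_pdf(name + suffix + '.html', name + suffix)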
