python3+beautifulSoup4.6抓取某网站小说(四)多线程抓取

上一篇文章中,目录只有两级:根目录“小说”,二级目录“作品名称”,其下就是小说文件。

本篇改造了部分代码,将目录设置为根目录->作者目录->作品目录->作品章节.txt.

但这并不是本篇内容的重点,重点是:使用这个爬虫程序抓取的时候,经常会因为网络丢包等原因导致程序中断。

本来想着是循环获取网站状态,然后重新发起请求,结果好像也没什么用。然后在虫师讲selenium的书中看到了多线程,正好就实验下,结果发现,速度很快,cool!

以下代码基本摘自虫师的selenium2

多线程的引用

import threading

方法调用:threading.Thread(target=music, args=('music方法参数1',music方法参数2) )

from time import sleep,ctime
import threading

def music(func, loop):
    """Print a timestamped 'music' line ``loop`` times, pausing 2 seconds after each.

    ``func`` is the label echoed on every line (a song title in the demo).
    """
    for _ in range(loop):
        print('music', func, ctime())
        sleep(2)

def movie(func, loop):
    """Print a timestamped 'movie' line ``loop`` times, pausing 4 seconds after each.

    ``func`` is the label echoed on every line (a film title in the demo).
    """
    for _ in range(loop):
        print('movie', func, ctime())
        sleep(4)

def testOne():
    """Run the music and movie demos one after the other on the calling thread."""
    # Sequential baseline: movie() only starts once music() has returned.
    for play, title in ((music, '简单的歌'), (movie, '两杆大烟枪')):
        play(title, 2)
    print('all end', ctime())
def testTwo():
    """Run the music/movie demos concurrently, one thread per job.

    All threads are started first and joined afterwards, so 'all end' is
    printed only after every job has finished.
    """
    jobs = (
        (music, '喜欢的人'),
        (movie, '搏击俱乐部'),
        (music, '喜欢的人2'),
    )
    threads = [
        threading.Thread(target=target, args=(title, 2))
        for target, title in jobs
    ]

    for t in threads:
        t.start()
    # join() blocks until the given thread has terminated.
    for t in threads:
        t.join()

    print('all end', ctime())

if __name__ == '__main__':
    # Runs the sequential demo; the alternatives below are left disabled,
    # enable one of them instead to try the threaded variants.
    testOne()
    # testTwo()
    # testThree()
    # threadsRun()
t.join() 会阻塞调用它的线程(这里是主线程),直到线程 t 执行结束,因此可以保证 all end 语句在最后打印出来。

  

 

创建线程管理类

创建类名时就引入Thread:class MyThread(threading.Thread)

class MyThread(threading.Thread):
    """Thread that runs ``func(*args)`` under a caller-supplied thread name."""

    def __init__(self, func, args, name):
        super().__init__()
        # Assigned after the base initialiser so it replaces the default name.
        self.name = name
        self.func, self.args = func, args

    def run(self):
        """Thread body: call the stored function with the stored arguments."""
        callee, params = self.func, self.args
        callee(*params)

 self:类实例,默认参数

 func:调用方法名

   args:参数

   name:方法+".__name__"

完整代码:

 1 class MyThread(threading.Thread):
 2 
 3     def __init__(self, func, args, name):
 4         threading.Thread.__init__(self)
 5         self.func = func
 6         self.args = args
 7         self.name = name
 8 
 9     def run(self):
10         self.func(*self.args)
11 
12 def super_play(file_,time):
13     for i in range(3):
14         print('play', file_, ctime())
15         sleep(time)
16 
17 
18 def time(args):
19     pass
20 
21 
22 def testThree():
23     threads = []
24     lists = {'气球.mp3': 3, '电影.rmvb': 4, 'last.avg' : 2}
25     for file_, time_ in lists.items():
26         t = MyThread(super_play, (file_, time_), super_play.__name__)
27         threads.append(t)
28 
29     files = range(len(lists))
30 
31     for f in files:
32         threads[f].start()
33     for f in files:
34         threads[f].join()
35 
36     print('all end', ctime())
View Code

 

改造小说爬虫

好了,多线程说完了,怎么调用咱们写的小说类呢,很简单

首先,改造pageOne

    def readPageOneByThread(self, page, time_):
        """Process a single listing page, then pause.

        page  -- page number as a string; it replaces "?" in self.two_page_url
        time_ -- seconds to sleep once the page has been handed to readPageTwo
        """
        listing_url = str(self.two_page_url).replace("?", page)
        print('第', page, '页---', listing_url)
        self.readPageTwo(listing_url, self.folder_path)
        sleep(time_)
    # end readPageOneByThread  ---------------------------------------

 init方法中,self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"

接下来,编写添加线程的方法:

    def threadsRun(self, first_page=1, last_page=122):
        """Crawl listing pages first_page..last_page (inclusive), one thread per page.

        Each thread runs self.readPageOneByThread(str(page), 2). All threads
        of this call are started before any is joined, and the method returns
        only after every one of them has finished.

        The defaults reproduce the page range that used to be hard-coded.
        """
        worker = self.readPageOneByThread
        batch = [
            MyThread(worker, (str(page), 2), worker.__name__)
            for page in range(first_page, last_page + 1)
        ]
        self.threads.extend(batch)

        # Start/join only the threads created by this call: a Thread object
        # can be started once, so iterating self.threads would raise
        # RuntimeError if the method were invoked a second time.
        for t in batch:
            t.start()
        for t in batch:
            t.join()

        print('all end: %s' % ctime())


class MyThread(threading.Thread):
    """Named worker thread whose body is a single call ``func(*args)``."""

    def __init__(self, func, args, name):
        super().__init__()
        self.func, self.args = func, args
        # Set after the base initialiser so it overrides the default name.
        self.name = name

    def run(self):
        """Invoke the stored callable with the stored positional arguments."""
        target, positional = self.func, self.args
        target(*positional)

  这里偷了个懒,直接写了总页数,其实也可以使用原来的pageone方法读取last的div获取页数

下面是完整代码:

  1 # -*- coding: UTF-8 -*-
  2 from urllib import request
  3 from bs4 import BeautifulSoup
  4 from time import sleep,ctime
  5 import os
  6 import threading
  7 import re
  8 import random
  9 
 10 '''
 11 使用BeautifulSoup抓取网页
 12 version:0.5 更新为本地缓存链接
 13 author:yaowei
 14 date:2018-03-23
 15 '''
 16 
 17 
 18 class Capture():
 19 
 20     def __init__(self):
 21         self.index_page_url = 'http://www.cuiweijuxs.com/'
 22         self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
 23         self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
 24         self.folder_path = '绯色/'
 25         self.href_list = []
 26         self.head = {}
 27         self.threads = []
 28         # 写入User Agent信息
 29         self.head[
 30             'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
 31 
 32     # end __init__ ---------------------------------------
 33 
 34     # 获取BeautifulSoup
 35     def getSoup(self,query_url):
 36         req = request.Request(query_url, headers=self.head)
 37         webpage = request.urlopen(req)
 38         html = webpage.read()
 39         soup = BeautifulSoup(html, 'html.parser')
 40         return soup
 41         # soup = BeautifulSoup(html, 'html5lib')
 42 
 43     # 读取分版页面,打开分页链接
 44     def readPageOne(self,count,time_):
 45 
 46         print('count=====',count)
 47 
 48         # 总页数
 49         if count :
 50             item_size = count
 51         else :
 52             # 读取页面
 53             soup = self.getSoup(self.one_page_url)
 54             last = soup.find("a", 'last')
 55             item_size = int(last.string)
 56 
 57         print('item_size=====',item_size)
 58         page_url = str(self.two_page_url)
 59 
 60         # 循环打开分页链接,读取分页页面
 61         for item in range(item_size):
 62             page = str(item + 1)
 63             new_page_url = page_url.replace("?", page)
 64             print('', page, '页---', new_page_url)
 65             path = self.folder_path
 66             self.readPageTwo(new_page_url, path)
 67 
 68         sleep(time_)
 69     # end readPageOne  ---------------------------------------
 70 
 71     def readPageOneByThread(self,page,time_):
 72         page_url = str(self.two_page_url)
 73         new_page_url = page_url.replace("?", page)
 74         print('', page, '页---', new_page_url)
 75         path = self.folder_path              
 76         self.readPageTwo(new_page_url, path)
 77         sleep(time_)
 78     # end readPageOneByThread  ---------------------------------------
 79 
 80     # 读取分页页面
 81     def readPageTwo(self, page_url, path):
 82         soup = self.getSoup(page_url)
 83         # first div[id="newscontent"]->div[class="l"]
 84         con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
 85         # first div[id="newscontent"]->div[class="l"]->all spann[class="s2"]
 86         span_list = con_div.find_all('span', {'class': 's2'})
 87 
 88         # 遍历span
 89         for span in span_list:
 90             # 找到父节点下的span[class="s5"],以作者为文件夹名字
 91             author = span.parent.find('span', {'class': 's5'}).get_text()
 92 
 93             # span[class="s2"]->a
 94             a_href = span.find('a')
 95             href = a_href.get('href')  # 单部作品链接
 96             folder_name = a_href.get_text()  # 作品名字
 97             print('a_href', href, '---folder_name', folder_name)
 98             new_path = path + '/' + author + '/' + folder_name
 99             self.createFolder(new_path)  # 创建文件夹
100 
101             self.readPageThree(href, new_path)  # 读取单部作品
102 
103             # t = threading.Thread(target=self.readPageThree, args={href, new_path})
104             # self.threads.append(t)
105             # end for
106 
107     # end readPage  ---------------------------------------
108 
109     # 打开作品链接,遍历单章
110     def readPageThree(self, page_url, path):
111         soup = self.getSoup(page_url)  # 作品页面
112         print('readPageThree--', page_url)
113         a_list = soup.find('div', {'id': 'list'}).find_all('a')
114         idx = 0  # 序号
115         for a_href in a_list:
116             idx = idx + 1
117             href = self.index_page_url + a_href.get('href')
118             file_path = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
119             print('file_a_href', href, '---file_path', file_path)
120 
121             '''
122             new_path = self.isTxt(file_path)
123             if new_path:
124                 print(new_path)
125                 file_object = open('网页链接//hrefs.txt', 'w', encoding='utf-8')
126                 file_object.write(href+','+new_path)
127                 file_object.close()
128              '''
129             self.readPageFour(href, file_path)
130 
131             #self.href_list.append({'href': href, 'file_path': file_path})
132 
133             # 多线程
134             #t = threading.Thread(target=self.readPageFour, args={href, file_path})
135             #t.start()
136             #t.join(15)
137 
138     # end readPageThree  ---------------------------------------
139 
140     # 读取单章内容并写入
141     def readPageFour(self, page_url, path):
142         new_path = self.isTxt(path)  # 是否存在,存在则返回'',没创建则返回合法文件名
143         if new_path:
144             soup = self.getSoup(page_url)
145             con_div = soup.find('div', {'id': 'content'})  # 读取文本内容
146             content = con_div.get_text().replace('
', '\n').replace(' ', ' ') 147 # content = content.replace('&','').replace('amp;','').replace('rdquo;','').replace('ldquo;','') 148 # content = content.rstrip("& amp;rdquo;amp;& amp;ldquo;") 149 150 self.writeTxt(new_path, content) # 写入文件 151 152 # end readPageFour --------------------------------------- 153 154 def readPageHtml(self, page_url, path): 155 soup = self.getSoup(page_url) 156 con_div = soup.find('div', {'id': 'content'}) 157 content = con_div.get_text().replace('
', '\n').replace(' ', ' ') 158 159 def createFolder(self, path): 160 path = path.strip() 161 # 去除尾部 \ 符号 162 path = path.rstrip("\\") 163 rstr = r"[\:\*\?\"\<\>\|]" # '/ \ : * ? " < > |' 164 new_path = re.sub(rstr, "_", path) # 替换为下划线 165 is_exists = os.path.exists(new_path) 166 # 不存在则创建 167 if not is_exists: 168 os.makedirs(new_path) 169 print('目录:', new_path + ' create') 170 else: 171 print(new_path + ' 目录已存在') 172 173 # end createFolder --------------------------------------- 174 175 def isTxt(self, path): 176 path = path.strip() 177 # 去除尾部 \ 符号 178 path = path.rstrip("\\") 179 rstr = r"[\:\*\?\"\<\>\|]" # '/ \ : * ? " < > |' 180 new_path = re.sub(rstr, "_", path) # 替换为下划线 181 isExists = os.path.exists(new_path) 182 if isExists: 183 print(new_path, '已存在') 184 return '' 185 else: 186 return new_path 187 188 # end createTxt --------------------------------------- 189 190 def writeTxt(self, file_name, content): 191 isExists = os.path.exists(file_name) 192 if isExists: 193 print(file_name, '已存在') 194 else: 195 file_object = open(file_name, 'w', encoding='utf-8') 196 file_object.write(content) 197 file_object.close() 198 199 # end writeTxt ------------------------------------------ 200 201 def run(self): 202 try: 203 self.readPageOne() 204 except BaseException as error: 205 print('error--', error) 206 207 def runTest(self): 208 try: 209 page_url = 'http://www.cuiweijuxs.com/4_4508/' 210 path = '小说/runTest' 211 self.readPageThree(page_url, path) 212 except BaseException as error: 213 print('error--', error) 214 215 def testRun(self,num,time_): 216 for i in range(3): 217 print('num=',num,ctime()) 218 sleep(time_) 219 220 def threadsRun(self): 221 222 #self.readPageOne(122) 223 224 for i in range(1,123): 225 page = str(i) 226 t = MyThread( self.readPageOneByThread, (page,2) , self.readPageOneByThread.__name__) 227 #t = threading.Thread(target=self.testRun, args=( str(i) )) 228 self.threads.append(t) 229 230 for t in self.threads: 231 t.start() 232 for t in self.threads: 
233 t.join() 234 #t.join() 235 236 print('all end: %s' % ctime()) 237 238 239 class MyThread(threading.Thread): 240 241 def __init__(self, func, args, name): 242 threading.Thread.__init__(self) 243 self.func = func 244 self.args = args 245 self.name = name 246 247 def run(self): 248 self.func(*self.args) 249 250 251 Capture().threadsRun()
View Code

 

  

 

你可能感兴趣的:(python3+beautifulSoup4.6抓取某网站小说(四)多线程抓取)