思路:
1、抓取解析获取整个网站的所有小说
2、抓取解析小说的所有章节路径
3、抓取解析小说所有章节的内容生成TXT
import threading
import time
import urllib.request
from bs4 import BeautifulSoup
import re
import socket

# Global socket timeout so a hung request cannot block a worker thread forever.
socket.setdefaulttimeout(10)


def getHTMLData(strURL):
    """Fetch *strURL* and return the raw response body (bytes).

    Retries exactly once on a network error; a second failure propagates to
    the caller (same semantics as the original, but without a bare
    ``except:`` that would also swallow KeyboardInterrupt/SystemExit).
    """
    try:
        return urllib.request.urlopen(strURL).read()
    except OSError:  # urllib.error.URLError and socket.timeout are OSError subclasses
        print('i get the error')
        return urllib.request.urlopen(strURL).read()


def getDataOnMatch(data, name, att, match):
    """Find all *name* tags with CSS class *att* in *data*.

    When *match* is a non-empty regex, it is applied to the stringified tag
    list and the findall() result is returned; otherwise the BeautifulSoup
    result set is returned as-is.
    """
    soup = BeautifulSoup(data, "html.parser")
    result = soup.find_all(name, att)
    if match != "":
        return re.compile(match).findall(str(result))
    return result


# Thread-safe FIFO of pending books: each entry is [book name, book URL].
class ListBookInfo:
    def __init__(self):
        self.lock = threading.Lock()
        self.nLsBookCnt = 0
        self.lsBookInfo = []

    def AddBookInfo(self, strBookName, strBookURL):
        """Append one [name, url] entry."""
        with self.lock:
            self.lsBookInfo.append([strBookName, strBookURL])
            self.nLsBookCnt += 1

    def GetHeadBookInfo(self):
        """Pop and return the oldest entry, or None when empty.

        (The original returned the int 0 when empty; None is equally falsy
        and is the idiomatic "no result" value.)
        """
        with self.lock:
            if self.nLsBookCnt > 0:
                self.nLsBookCnt -= 1
                return self.lsBookInfo.pop(0)
            return None

    def GetSize(self):
        """Return the current number of queued entries."""
        with self.lock:
            return self.nLsBookCnt

    def ClearLsBook(self):
        """Drop all queued entries."""
        with self.lock:
            self.lsBookInfo.clear()
            self.nLsBookCnt = 0


# Thread-safe FIFO of parsed books: each entry is
# [book name, list of (chapter URL, chapter title) pairs].
class BookPageInfo:
    def __init__(self):
        self.lock = threading.Lock()
        self.nBookPageCnt = 0
        self.lsBookPageInfo = []

    def AddBookPageInfo(self, strBookName, lsBookPageURL):
        """Append one [name, chapter-list] entry."""
        with self.lock:
            self.lsBookPageInfo.append([strBookName, lsBookPageURL])
            self.nBookPageCnt += 1

    def GetHeadBookPageInfo(self):
        """Pop and return the oldest entry, or None when empty (was 0)."""
        with self.lock:
            if self.nBookPageCnt > 0:
                self.nBookPageCnt -= 1
                return self.lsBookPageInfo.pop(0)
            return None

    def GetSize(self):
        """Return the current number of queued entries."""
        with self.lock:
            return self.nBookPageCnt

    def ClearLsBookPage(self):
        """Drop all queued entries."""
        with self.lock:
            self.lsBookPageInfo.clear()
            self.nBookPageCnt = 0


def getArticleType(data):
    """Extract the site's category navigation: returns [(url, name), ...].

    NOTE(review): the published pattern '\\(.*)\\</a\\>' is syntactically
    invalid (unbalanced parenthesis -> re.error at runtime); the HTML tags
    were evidently stripped when the code was posted.  Reconstructed below
    so that i[0] is the href and i[1] the link text, matching how callers
    use the tuples.  TODO: verify against the live quanshuwang.com markup.
    """
    soup = BeautifulSoup(data, "html.parser")  # explicit parser, consistent with getDataOnMatch
    data_ul = soup.find_all("ul", "channel-nav-list")
    print(data_ul)
    re_pat = re.compile(r'<a href="(.*?)".*?>(.*?)</a>')
    lsType = re_pat.findall(str(data_ul))
    print(lsType)
    for i in lsType:
        print("%s-->%s" % (i[1], i[0]))
    return lsType


def getArticle(strURL):
    """For one category page, return [[book_url, book_name], ...].

    NOTE(review): the published regexes were garbled (literal '\\(' and a
    stray trailing space); reconstructed as raw strings with the same group
    layout the indexing below relies on.  TODO: verify on the site.
    """
    data = getHTMLData(strURL)
    print(data)
    ll = getDataOnMatch(data, "ul", "seeWell cf", r'<li>(.*?)href="(.*?)"(.*?)</li>')
    lsBook = []
    for i in ll:
        obj = []
        # Follow the list entry (i[1]) to the book's detail page and pull
        # the real reading URL out of its "main b-detail" section.
        listReData = getDataOnMatch(getHTMLData(i[1]), "section", "main b-detail",
                                    r'(.*?)href="(.*?)"(.*?)')
        obj.append(listReData[0][1])
        # The book title lives in the cover image's alt attribute.
        lf = re.findall(r'(.*?)alt="(.*?)"(.*?)', i[2])
        obj.append(lf[0][1])
        lsBook.append(obj)
    return lsBook


def getArticlePageContent(strURL):
    """For one book's index page, return [[chapter_url, chapter_title], ...].

    NOTE(review): regex reconstructed from the garbled published source;
    group 2 is the relative chapter href, group 3 the title attribute.
    TODO: verify on the site.
    """
    data = getHTMLData(strURL)
    InfoList = getDataOnMatch(data, "div", "clearfix dirconone",
                              r'<li>(.*?)href="(.*?)" title="(.*?)"(.*?)</li>')
    pageInfoList = []
    for i in InfoList:
        # Chapter links are relative; join them onto the book URL.
        pageInfoList.append([strURL + '/' + i[1], i[2]])
    return pageInfoList


def getArticleContent(strURL):
    """Fetch one chapter page and return its "mainContenr" div.

    Retries the whole fetch+parse once on a network error (mirrors
    getHTMLData's retry policy); a second failure propagates.
    """
    try:
        data = getHTMLData(strURL)
        return getDataOnMatch(data, "div", "mainContenr", "")
    except OSError:
        print('i get the error')
        data = getHTMLData(strURL)
        return getDataOnMatch(data, "div", "mainContenr", "")


class CleverBookSys:
    """Wires the queues and worker threads together and starts the crawl."""

    def __init__(self):
        # NOTE(review): bExit is polled by the workers but never set to 1
        # anywhere in this file, so there is no clean shutdown path.
        self.bExit = 0
        self.eventBook = threading.Event()  # signalled when lsBookInfo gains entries
        self.eventPage = threading.Event()  # signalled when lsBookPageInfo gains entries
        self.lsBookInfo = ListBookInfo()
        self.lsBookPageInfo = BookPageInfo()
        self.thrParseBook = ThreadForParseAllBook(self, "ThreadForParseAllBook")
        self.thrParseBookPage = ThreadForParseBookPage(self, "ThreadForParseBookPage")
        # Ten download threads; tune to available bandwidth.
        self.thrDownLoad = []
        for nCount in range(10):
            thread = ThreadForDownloadTxt(self, "ThreadForDownloadTxt", nCount)
            thread.start()
            self.thrDownLoad.append(thread)
        self.thrParseBook.start()
        self.thrParseBookPage.start()


# Crawls the whole site for book names and URLs (incomplete per the author).
class ThreadForParseAllBook(threading.Thread):

    def __init__(self, parent, strThrName):
        threading.Thread.__init__(self)
        self.parent = parent
        self.strThrName = strThrName
        # Parsing is specific to this one site.
        # NOTE(review): this performs network I/O at construction time,
        # before the thread is even started.
        self.lsArticle = getArticleType(getHTMLData("http://www.quanshuwang.com/"))

    def run(self):
        print("Thread %s is Start!!!" % (self.strThrName))
        for art in self.lsArticle:
            # art[0] is the category URL; each bookInfo is [url, name].
            bookInfoList = getArticle(art[0])
            for bookInfo in bookInfoList:
                self.parent.lsBookInfo.AddBookInfo(bookInfo[1], bookInfo[0])
                self.parent.eventBook.set()


# Resolves every queued book into its chapter name/URL list.
class ThreadForParseBookPage(threading.Thread):

    def __init__(self, parent, strThrName):
        threading.Thread.__init__(self)
        self.parent = parent
        self.strThrName = strThrName

    def run(self):
        print("Thread %s is Start!!!" % (self.strThrName))
        while self.parent.bExit == 0:
            nSize = self.parent.lsBookInfo.GetSize()
            print("ThreadForParseBookPage-->%d" % nSize)
            if nSize > 0:
                bookInfo = self.parent.lsBookInfo.GetHeadBookInfo()
                print("ThreadForParseBookPage->%s" % bookInfo)
                PageInfo = getArticlePageContent(bookInfo[1])
                print("ThreadForParseBookPage2->%s" % PageInfo)
                self.parent.lsBookPageInfo.AddBookPageInfo(bookInfo[0], PageInfo)
                self.parent.eventPage.set()
            else:
                # Sleep until the parser thread enqueues another book.
                print("self.parent.eventBook.wait()")
                self.parent.eventBook.wait()
                if nSize <= 0:
                    self.parent.eventBook.clear()
                print("self.parent.eventBook.run()")


# Downloads every chapter of a queued book and writes one TXT file per book.
class ThreadForDownloadTxt(threading.Thread):

    def __init__(self, parent, strThrName, nThrNO):
        threading.Thread.__init__(self)
        self.parent = parent
        self.strThrName = strThrName
        self.nThrNO = nThrNO

    def run(self):
        print("Thread %s%d is Start!!!" % (self.strThrName, self.nThrNO))
        while self.parent.bExit == 0:
            nSize = self.parent.lsBookPageInfo.GetSize()
            print("ThreadForDownloadTxt-->%d" % nSize)
            if nSize > 0:
                bookPageInfoList = self.parent.lsBookPageInfo.GetHeadBookPageInfo()
                print("ThreadForDownloadTxt%d-->%s" % (self.nThrNO, bookPageInfoList))
                fileName = 'D:\\txt\\' + bookPageInfoList[0] + '.txt'
                # `with` guarantees the file is closed even if a chapter
                # download raises (the original leaked the handle then).
                with open(fileName, 'w', encoding='utf-8') as file_object:
                    for pageInfo in bookPageInfoList[1]:
                        print("ThreadForDownloadTxt%d-->%s" % (self.nThrNO, pageInfo))
                        content = getArticleContent(pageInfo[0])
                        print(content)
                        file_object.write(pageInfo[1])
                        file_object.write(str(content))
                        file_object.flush()
            else:
                # Sleep until the page-parser thread enqueues another book.
                print("self.parent.eventPage.wait()")
                self.parent.eventPage.wait()
                if nSize <= 0:
                    self.parent.eventPage.clear()
                print("self.parent.eventPage.run()")


if __name__ == "__main__":
    # NOTE(review): originally this ran unconditionally at import time,
    # starting 12 threads and network traffic on a mere `import`.
    cleverBook = CleverBookSys()
缺点:
1、学习Python仅两天,很多语法以及基础的细节还不明白
2、对于不同网站,抓取解析的规则需要相应调整