# Scraper code:
import urllib.request
from bs4 import BeautifulSoup
#coding: utf-8
class xiaoShuo():
    """Scrape a novel: read an index page, follow its links, extract text.

    The index page at ``url`` is searched for container elements; every
    ``<a>`` inside those containers is treated as a link to a chapter page,
    and each chapter page is searched for the elements holding the text.
    """

    def __init__(self, url, parLabelValue, parLabelType, parLabel,
                 clildLabelValue, clildLabelType, clildLabel, enc):
        """Store the scraping configuration.

        Args:
            url: Address of the index page.
            parLabelValue: Attribute value identifying the link containers
                on the index page.
            parLabelType: Attribute name matched against ``parLabelValue``.
            parLabel: Tag name of the link containers.
            clildLabelValue: Attribute value identifying the text elements
                on a chapter page.
            clildLabelType: Attribute name matched against
                ``clildLabelValue``.
            clildLabel: Tag name of the text elements.
            enc: Character encoding used to decode downloaded pages.
        """
        self.url = url
        self.parLabelValue = parLabelValue
        self.parLabelType = parLabelType
        self.enc = enc
        self.parLabel = parLabel
        self.clildLabelValue = clildLabelValue
        self.clildLabelType = clildLabelType
        self.clildLabel = clildLabel

    def _fetchPage(self, url, enc):
        """Download *url*, decode it with *enc* and return the parsed page."""
        # The context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode(enc)
        return BeautifulSoup(html, 'html.parser')

    def getUrlContent(self):
        """Print and collect the text of every page linked from the index.

        Returns:
            A list with one extracted text per followed link, in page order.
        """
        # Local import keeps this block self-contained; urljoin resolves
        # relative hrefs against the index page address.
        from urllib.parse import urljoin

        pageNode = self._fetchPage(self.url, self.enc)
        containers = pageNode.find_all(
            self.parLabel, {self.parLabelType: self.parLabelValue})

        contents = []
        for container in containers:
            for link in container.select("a"):
                href = link.get("href")
                if not href:
                    # An anchor without a target has nothing to download.
                    continue
                # Fetch the linked page itself; fetching self.url again here
                # would only re-read the index page for every link.
                content = self.getXiaoShuoContent(
                    urljoin(self.url, href), self.clildLabel,
                    self.clildLabelValue, self.clildLabelType, self.enc)
                print(content)
                contents.append(content)
        return contents

    def getXiaoShuoContent(self, url, childLabel, childLabelValue,
                           childLabelType, enc):
        """Return the text of all matching elements on the page at *url*.

        Args:
            url: Address of the page to download.
            childLabel: Tag name of the elements holding the text.
            childLabelValue: Attribute value those elements must have.
            childLabelType: Attribute name matched against
                ``childLabelValue``.
            enc: Character encoding used to decode the page.

        Returns:
            The text of every matching element joined with newlines, or an
            empty string when nothing matches.
        """
        pageNode = self._fetchPage(url, enc)
        iterms = pageNode.find_all(childLabel, {childLabelType: childLabelValue})
        # Always a plain string: every match is kept instead of only the
        # last one, and no accidental one-element tuple is produced.
        return "\n".join(iterm.get_text() for iterm in iterms)

    def writeTofile(self, fileName, content, encoding="utf-8"):
        """Write *content* to ``<fileName>.txt``.

        Args:
            fileName: Output path without the ``.txt`` suffix.
            content: A string, or an iterable of strings written in order.
            encoding: Text encoding of the output file.

        Failures are reported by printing a message rather than raising.
        """
        try:
            with open("%s.txt" % (fileName,), "w", encoding=encoding) as f:
                if isinstance(content, str):
                    f.write(content)
                else:
                    f.writelines(content)
        except (OSError, TypeError, ValueError):
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # still propagate; I/O, type and encoding errors stay best-effort.
            print("写入错误")
# Build a scraper for one index page and run it. Arguments are passed by
# keyword so each selector setting is visible at the call site.
a = xiaoShuo(
    url="https://www.szzyue.com/dushu/11/11255/",
    parLabelValue="L",
    parLabelType="class",
    parLabel="td",
    clildLabelValue="contents",
    clildLabelType="id",
    clildLabel="dd",
    enc="gbk",
)
# The result of getUrlContent() is kept in `html` for optional inspection.
html = a.getUrlContent()