# Recorder: zff
import urllib.request
from bs4 import BeautifulSoup
import os
from docx import Document
class Spider(object):
    """Crawl a faculty web site: collect teacher names, profile texts and
    photos, then save each profile as a Word document.

    NOTE(review): every base-URL prefix in this file is an empty string
    ("") — the real site prefix was apparently stripped before check-in;
    it must be restored before the spider can run.
    """

    def __init__(self):
        # BUG FIX: the constructor was named `init`, so Python never called
        # it and every attribute access raised AttributeError.
        self.url = ""
        self.page = []
        self.dirName = r"C:\Users\Administrator\Desktop"
        # Accumulates one list of names per category page scraped.
        self.teacher_name_all_text = []

    def common_spider(self, url):
        """Download *url* and return it parsed as a BeautifulSoup tree."""
        # BUG FIX: close the HTTP response instead of leaking the socket.
        with urllib.request.urlopen(url) as page:
            html = page.read().decode("utf8")
        return BeautifulSoup(html, "lxml")

    def one_stage_spider(self, one_stage_url):
        """Return the hrefs inside the third <ul class="child"> block."""
        soup = self.common_spider(one_stage_url)
        menus = soup.find_all("ul", attrs={"class": "child"})
        return [a["href"] for a in menus[2].find_all("a")]

    def two_stage_spider(self, two_stage_url):
        """Visit the first four category pages.

        Returns (per-page lists of profile hrefs, accumulated teacher
        names). Side effect: extends self.teacher_name_all_text via
        get_teacher_name().
        """
        two_stage_text = []
        # Only the first four categories are scraped, as in the original.
        for suffix in two_stage_url[:4]:
            page_url = "" + suffix  # TODO: prepend the real site prefix
            self.get_teacher_name(page_url)
            soup = self.common_spider(page_url)
            panels = soup.find_all("div", attrs={"class": "right02 border-bt"})
            two_stage_text.append([a["href"] for a in panels[0].find_all("a")])
        # BUG FIX: the original returned the misspelled, undefined name
        # `teacher_all_name_text` (NameError) and overwrote the name list
        # on every loop iteration instead of accumulating it.
        return two_stage_text, self.teacher_name_all_text

    def get_class_name(self, url):
        """Return the category names listed in the third <ul class="child">."""
        soup = self.common_spider(url)
        menus = soup.find_all("ul", attrs={"class": "child"})
        return menus[2].text.strip().split("\n")

    def get_teacher_name(self, url):
        """Collect teacher names from every <div class="right"> on *url*
        into self.teacher_name_all_text and return the accumulator."""
        soup = self.common_spider(url)
        for panel in soup.find_all("div", attrs={"class": "right"}):
            names = panel.ul.text.strip().split("\n")
            # BUG FIX: the original appended to a bare, undefined global
            # name instead of the instance attribute.
            self.teacher_name_all_text.append(names)
        return self.teacher_name_all_text

    def mkdir(self, path):
        """Create *path* (and parents); return False if it already exists."""
        path = path.strip()
        print("创建目录 %s" % path)
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def get_teacher_source(self, teacher_source_url):
        """Return the profile text inside <div id="news1"> as one string."""
        soup = self.common_spider(teacher_source_url)
        pieces = []
        for row in soup.find_all("div", attrs={"id": "news1"}):
            cells = row.find_all("span")
            for cell in cells[1]:
                pieces.append(cell.get_text().strip())
        # Strip the stray MS-Office XML-namespace marker from pasted HTML.
        text = "".join(pieces).replace("?xml:namespace>", "")
        # BUG FIX: the original built the string but never returned it,
        # so every caller received None.
        return text

    def get_teacher_image(self, teacher_image_url):
        """Return the src attribute of the first <img> on the page."""
        soup = self.common_spider(teacher_image_url)
        return soup.find_all("img")[0]["src"]

    def mkdoc(self, paragraph, picture_path, path_name):
        """Write *paragraph* (and, when given, the image at *picture_path*)
        into a Word document saved at *path_name*."""
        doc = Document()
        doc.add_paragraph(paragraph)
        if picture_path:
            doc.add_picture(picture_path)
        doc.save(path_name)

    def third_stage_spider(self, third_stage_url):
        """Visit every profile href; return (image srcs, profile texts)."""
        image_all = []
        teacher_all_source = []
        for group in third_stage_url:
            for suffix in group:
                url_zone = "" + suffix  # TODO: prepend the real site prefix
                image_all.append(self.get_teacher_image(url_zone))
                # BUG FIX: the original rebound `teacher_all_source` to the
                # fetched text and then appended the object to itself,
                # destroying the accumulator on every iteration.
                teacher_all_source.append(self.get_teacher_source(url_zone))
        return image_all, teacher_all_source

    def image_save(self, image_teacher, teacher_nametext):
        """Download every image src in *image_teacher* and save it under
        <dirName>/photo, named after the matching *teacher_nametext* entry.

        NOTE(review): entries produced by get_teacher_name are *lists* of
        names; str() keeps the write from crashing, but the resulting
        filenames should be verified against the real data.
        """
        # BUG FIX: the original read the undefined globals
        # `teacher_photo_path` and `teacher_name_all_text`.
        photo_dir = self.dirName + "/" + "photo"
        for suffix, name in zip(image_teacher, teacher_nametext):
            data = urllib.request.urlopen("" + suffix).read()
            # BUG FIX: context manager closes the file even if write fails.
            with open(photo_dir + "/" + str(name), "wb") as fh:
                fh.write(data)

    def main_spider(self, url):
        """Run the whole pipeline starting from the site index *url*."""
        one_stage_text = self.one_stage_spider(url)
        # BUG FIX: `one_satge_text` typo raised NameError.
        print(one_stage_text)
        class_name = self.get_class_name(url)
        two_stage_text, teacher_name_all_text = self.two_stage_spider(one_stage_text)
        image_all, teacher_all_source = self.third_stage_spider(two_stage_text)
        # BUG FIX: `dirName` was read as an undefined global; it is an
        # instance attribute.
        self.mkdir(self.dirName)
        for class_name_path in class_name:
            self.mkdir(self.dirName + "/" + class_name_path)
        teacher_photo_path = self.dirName + "/" + "photo"
        self.mkdir(teacher_photo_path)
        self.image_save(image_all, teacher_name_all_text)
        photos = sorted(os.listdir(teacher_photo_path))
        for i in range(len(teacher_all_source)):
            # BUG FIX: the original passed the entire directory listing as
            # the picture path and saved every document to the literal name
            # "i.docx", overwriting it each time through the loop.
            picture = (teacher_photo_path + "/" + photos[i]
                       if i < len(photos) else None)
            self.mkdoc(teacher_all_source[i], picture,
                       teacher_photo_path + "/" + str(i) + ".docx")
# Script entry point.
if __name__ == "__main__":
    # BUG FIX: guard the entry point so importing this module does not
    # immediately start the crawl.
    s = Spider()
    # TODO: supply the real start URL — an empty string cannot be opened.
    s.main_spider(url="")