Python的爬虫

记录者:zff

import urllib.request
from bs4 import BeautifulSoup
import os
from docx import Document

class Spider(object):
    """Crawler that scrapes teacher listings from a school website.

    Workflow (driven by ``main_spider``):
      1. ``one_stage_spider``   - collect department links from the index page.
      2. ``two_stage_spider``   - collect per-teacher page links and names.
      3. ``third_stage_spider`` - collect each teacher's photo URL and bio text.
      4. Save photos to disk and write one .docx per teacher.

    NOTE(review): every URL prefix in the original source is the empty string
    (``"" + relative_url``) -- the target site's base URL must be filled in
    before the crawler can actually run.
    """

    def __init__(self):
        # BUG FIX: the original declared ``init`` (plain method), so the
        # constructor never ran and none of these attributes existed.
        # Base site URL; empty in the original source -- TODO confirm target.
        self.url = ""
        self.page = []
        # Root directory where class folders, photos and docs are created.
        self.dirName = r"C:\Users\Administrator\Desktop"
        # Accumulates the teacher-name lists gathered by get_teacher_name.
        self.teacher_name_all_text = []

    def common_spider(self, url):
        """Fetch *url* and return its HTML parsed as a BeautifulSoup tree."""
        page = urllib.request.urlopen(url)
        html = page.read().decode("utf8")
        return BeautifulSoup(html, "lxml")

    def one_stage_spider(self, one_stage_url):
        """Return the hrefs found in the third <ul class="child"> block."""
        html = self.common_spider(one_stage_url)
        child_lists = html.find_all("ul", attrs={"class": "child"})
        return [a["href"] for a in child_lists[2].find_all("a")]

    def two_stage_spider(self, two_stage_url):
        """For the first four department links, collect teacher-page hrefs.

        Returns a tuple ``(per-department href lists, accumulated names)``.
        """
        two_stage_text = []
        teacher_names = []
        for i in range(0, 4):
            # NOTE(review): site prefix is empty -- fill in the base URL.
            two_stage_all_url = "" + two_stage_url[i]
            teacher_names = self.get_teacher_name(two_stage_all_url)
            html = self.common_spider(two_stage_all_url)
            blocks = html.find_all("div", attrs={"class": "right02 border-bt"})
            two_stage_text.append([a["href"] for a in blocks[0].find_all("a")])
        # BUG FIX: the original returned the misspelled, undefined name
        # ``teacher_all_name_text``, which raised NameError.
        return two_stage_text, teacher_names

    def get_class_name(self, url):
        """Return the department names in the third <ul class="child"> block."""
        html = self.common_spider(url)
        child_lists = html.find_all("ul", attrs={"class": "child"})
        return child_lists[2].text.strip().split("\n")

    def get_teacher_name(self, url):
        """Append the name lists on *url* to ``self.teacher_name_all_text``."""
        html = self.common_spider(url)
        for block in html.find_all("div", attrs={"class": "right"}):
            names = block.ul.text.strip().split("\n")
            # BUG FIX: the original appended to a bare undefined local;
            # the accumulator lives on the instance.
            self.teacher_name_all_text.append(names)
        return self.teacher_name_all_text

    def mkdir(self, path):
        """Create *path* (and parents); True if created, False if it existed."""
        path = path.strip()
        print("创建目录 %s" % path)
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def get_teacher_source(self, teacher_source_url):
        """Return the concatenated bio text from the page's ``#news1`` spans."""
        html = self.common_spider(teacher_source_url)
        parts = []
        for row in html.find_all("div", attrs={"id": "news1"}):
            cells = row.find_all("span")
            for cell in cells[1]:
                parts.append(cell.get_text().strip())
        # Strip the Word-export namespace artifact from the scraped text.
        text = "".join(parts).replace("?xml:namespace>", "")
        # BUG FIX: the original built the text but never returned it.
        return text

    def get_teacher_image(self, teacher_image_url):
        """Return the src of the first <img> on the teacher's page."""
        html = self.common_spider(teacher_image_url)
        return html.find_all("img")[0]["src"]

    def mkdoc(self, paragraph, picture_path, path_name):
        """Write a .docx with *paragraph* and the picture at *picture_path*."""
        doc = Document()
        doc.add_paragraph(paragraph)
        doc.add_picture(picture_path)
        doc.save(path_name)

    def third_stage_spider(self, third_stage_url):
        """Collect ``(image URLs, bio texts)`` for every teacher page link."""
        image_all = []
        teacher_all_source = []
        for dept_links in third_stage_url:
            for link in dept_links:
                url_zone = "" + link
                image_all.append(self.get_teacher_image(url_zone))
                # BUG FIX: the original rebound ``teacher_all_source`` to the
                # scraped string and then appended that string to itself,
                # destroying the accumulator on the first iteration.
                source_text = self.get_teacher_source(url_zone)
                teacher_all_source.append(source_text)
        return image_all, teacher_all_source

    def image_save(self, image_teacher, teacher_nametext):
        """Download each image URL and save it under ``<dirName>/photo``."""
        # BUG FIX: the original referenced the undefined locals
        # ``teacher_photo_path`` and ``teacher_name_all_text``; build the
        # path from instance state and the passed-in name list instead.
        photo_dir = self.dirName + "/" + "photo"
        for i in range(len(image_teacher)):
            url_image = "" + image_teacher[i]
            image_item = urllib.request.urlopen(url_image).read()
            photo_path = photo_dir + "/" + str(teacher_nametext[i])
            # ``with`` guarantees the file handle is closed even on error.
            with open(photo_path, "wb") as f:
                f.write(image_item)

    def main_spider(self, url):
        """Run the full crawl from *url* and write all output files."""
        one_stage_text = self.one_stage_spider(url)
        # BUG FIX: the original printed the misspelled ``one_satge_text``.
        print(one_stage_text)
        class_name = self.get_class_name(url)

        two_stage_text, teacher_name_all_text = self.two_stage_spider(one_stage_text)
        image_all, teacher_all_source = self.third_stage_spider(two_stage_text)
        # BUG FIX: ``dirName`` is an instance attribute, not a bare local.
        self.mkdir(self.dirName)
        for class_name_path in class_name:
            self.mkdir(self.dirName + "/" + class_name_path)
        teacher_photo_path = self.dirName + "/" + "photo"
        self.mkdir(teacher_photo_path)
        self.image_save(image_all, teacher_name_all_text)
        # BUG FIX: the original passed the whole os.listdir() list as the
        # picture path and the literal name "i.docx" for every document.
        photos = sorted(os.listdir(teacher_photo_path))
        for i in range(len(teacher_all_source)):
            if i < len(photos):
                picture = os.path.join(teacher_photo_path, photos[i])
                doc_path = os.path.join(teacher_photo_path, "%d.docx" % i)
                self.mkdoc(teacher_all_source[i], picture, doc_path)

if __name__ == "__main__":
    # Run the crawler only when executed as a script, not on import.
    # NOTE(review): the start URL is empty in the original source -- the
    # crawl cannot succeed until a real URL is supplied.
    s = Spider()
    s.main_spider(url="")

你可能感兴趣的:(Python的爬虫)