教师结构化面试,一键获取资料

教师结构化面试,一个程序员男朋友,为爱而码


【需求】

  1、教师结构化面试资料太少,而且最好能打印出来

  2、资料保存为Word文件,方便共享


作为一名程序员,开始coding...

把如下代码保存为download.py,双击运行,结果如下:

教师结构化面试,一键获取资料_第1张图片

#coding=utf-8
from lxml import etree
import requests
from docx import Document
import re

class Download():
    """Scrape interview articles from wap.zgjsks.com and save each as a .docx.

    Workflow (see ``download``): collect article links from the paginated
    index, work out how many sub-pages each article has, gather the text of
    every ``<p>`` in the article body and write it to ``<title>.docx`` in the
    current working directory.
    """

    # Seconds to wait on a single HTTP request; without a timeout one stalled
    # connection would hang the whole batch forever.
    REQUEST_TIMEOUT = 15
    # Index pages are numbered 1..LIST_PAGE_COUNT.
    LIST_URL = "http://wap.zgjsks.com/html/jszp/mianshi/jiegouhua/{}.html"
    LIST_PAGE_COUNT = 30
    # Value returned by getPagerNextMaxNum when no pager can be read; used as
    # an exclusive upper bound, so it means "exactly one page".
    DEFAULT_PAGER_MAX = 2

    def __init__(self):
        pass

    def _fetchTree(self, url):
        """Fetch ``url`` and return the parsed lxml tree, or None on failure."""
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # Best effort: report and let the caller skip this page instead of
            # aborting a long-running batch.
            print("[!E]{} -- {}".format(url, err))
            return None
        # Convert the page source into a tree that XPath can query.
        return etree.HTML(response.content)

    @staticmethod
    def _parsePagerText(pagerText):
        """Return total page count + 1 parsed from a "current/total" pager text.

        Falls back to 2 (a single page) when the text is missing or does not
        contain a ``<number>/<number>`` pair.
        """
        match = re.search(r"\d+\s*/\s*(\d+)", pagerText or "")
        if match is None:
            return Download.DEFAULT_PAGER_MAX
        return int(match.group(1)) + 1

    @staticmethod
    def _safeFileName(title):
        """Replace characters that are not allowed in Windows file names with '-'.

        Titles on the site may contain such characters, e.g. the '|' in
        '教师招聘面试指导|结构化面试——未来教育系统考'.
        """
        return re.sub(r'[\\/:*?"<>|\r\n\t]', '-', title)

    def getPageUrl(self):
        """Return a dict mapping article URL -> article title from the index pages."""
        paperAll = {}
        for pageNum in range(1, self.LIST_PAGE_COUNT + 1):
            selector = self._fetchTree(self.LIST_URL.format(pageNum))
            if selector is None:
                continue
            paperList = selector.xpath("//*[contains(concat(' ', @class, ' '), 'recruit_right')]/b/a")
            for paper in paperList:
                paperUrl = paper.attrib.get('href')
                if not paperUrl:
                    # An anchor without a target cannot be downloaded.
                    continue
                paperAll[paperUrl] = paper.text
        return paperAll

    def getPagerNextMaxNum(self, paperUrl):
        """Return the exclusive upper bound of sub-page numbers for an article.

        The value is read from the first element whose class contains
        'fenye'; when that element is absent or unreadable the result is 2,
        i.e. the article is treated as having a single page.
        """
        selector = self._fetchTree(paperUrl)
        if selector is None:
            return self.DEFAULT_PAGER_MAX
        pagerNodes = selector.xpath("//*[contains(concat(' ', @class, ' '), 'fenye')]")
        if not pagerNodes:
            return self.DEFAULT_PAGER_MAX
        return self._parsePagerText(pagerNodes[0].text)

    def getNodeText(self, nodeP):
        """Return all text inside ``nodeP`` in document order, at any depth.

        The node's own tail is not included; tails of descendants are.
        """
        parts = []
        if nodeP.text is not None:
            parts.append(nodeP.text)
        for childNode in nodeP:
            # Comments / processing instructions have a non-string tag; their
            # "text" is markup, not content, so only their tail is kept.
            if isinstance(childNode.tag, str):
                # Use the recursive result so deeply nested text is not lost.
                parts.append(self.getNodeText(childNode))
            if childNode.tail is not None:
                parts.append(childNode.tail)
        return ''.join(parts)

    def _collectParagraphs(self, paperUrl):
        """Return the text of every body paragraph across an article's sub-pages."""
        paperContent = []
        for pageNextUrlNum in range(1, self.getPagerNextMaxNum(paperUrl)):
            # NOTE(review): page 1 is requested as "<name>_1.html", as in the
            # original script -- confirm the site serves the first page there.
            pageNextUrl = paperUrl.replace(".html", "_{}.html".format(pageNextUrlNum))
            selector = self._fetchTree(pageNextUrl)
            if selector is None:
                continue
            paperList = selector.xpath("//*[contains(concat(' ', @class, ' '), 'article_box_info')]/p")
            for paper in paperList:
                paperContent.append(self.getNodeText(paper))
        return paperContent

    def download(self):
        """Download every article found on the index and save it as a .docx file."""
        for paperUrl, paperTitle in self.getPageUrl().items():
            # The link text may be missing; fall back to the URL's file stem so
            # the heading and the file name are always usable.
            paperTitle = (paperTitle or '').strip()
            if not paperTitle:
                urlStem = paperUrl.rstrip('/').rsplit('/', 1)[-1].replace('.html', '')
                paperTitle = urlStem or 'untitled'
            print("[*D]{} -- {}".format(paperTitle, paperUrl))

            document = Document()
            document.add_heading(paperTitle, 0)
            for paperLine in self._collectParagraphs(paperUrl):
                # Everything from the "related recommendations" marker onwards
                # is not part of the article.
                if paperLine == '相关推荐:':
                    break
                document.add_paragraph(paperLine)
            document.save('{}.docx'.format(self._safeFileName(paperTitle)))


if __name__ == "__main__":
    # Script entry point: run the full scrape-and-save job.
    Download().download()

你可能感兴趣的:(python)