教师结构化面试,一个程序员男朋友,为爱而码
【需求】
1、教师结构化面试的资料太少,而且最好能打印出来
2、资料保存为 Word 文件,方便共享
作为一名程序员,开始coding...
把如下代码保存为download.py,双击运行,结果如下:
#coding=utf-8
from lxml import etree
import requests
from docx import Document
import re
class Download:
    """Download structured-interview articles and save each one as a .docx file.

    The crawler walks the listing pages under
    ``http://wap.zgjsks.com/html/jszp/mianshi/jiegouhua/``, collects every
    article link, fetches each article's sub-pages, and writes the paragraph
    text into one Word document per article in the current working directory.
    """

    # Seconds to wait on the server before giving up. ``requests.get`` has no
    # default timeout, so a stalled connection would otherwise hang the run.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        pass

    def _fetchTree(self, url):
        """Fetch *url* and parse the HTML into a tree that XPath can query.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return etree.HTML(response.content)

    @staticmethod
    def _safeFileName(title):
        r"""Replace the characters ``\ / : * ? " < > |`` in *title* with ``-``.

        Article titles may contain such characters (for example ``|``), which
        are not usable in file names on Windows.
        """
        return re.sub(r'[\\/:*?"<>|]', '-', title)

    def getPageUrl(self):
        """Collect article links from listing pages 1 through 30.

        A listing page that cannot be fetched is reported and skipped.

        Returns:
            dict: article URL (the link's ``href``) mapped to the link text.
        """
        paperAll = {}
        for pageNum in range(1, 31):
            url = "http://wap.zgjsks.com/html/jszp/mianshi/jiegouhua/{}.html".format(pageNum)
            try:
                selector = self._fetchTree(url)
            except requests.RequestException as err:
                print("[*E]{} -- {}".format(url, err))
                continue
            paperList = selector.xpath("//*[contains(concat(' ', @class, ' '), 'recruit_right')]/b/a")
            for paper in paperList:
                paperAll[paper.attrib['href']] = paper.text
        return paperAll

    def getPagerNextMaxNum(self, paperUrl):
        """Return the exclusive upper bound for the article's sub-page numbers.

        The text of the first element whose class contains ``fenye`` is
        stripped of newlines, tabs and parentheses and split on ``/``; the
        part after the slash is read as the page count (the text is expected
        to look like ``(1/3)``). The result is that count plus one, so it can
        be used directly as ``range(1, result)``.

        Returns:
            int: page count + 1, or 2 (a single page) when the pager element
            is missing or its text cannot be parsed.

        Raises:
            requests.RequestException: if the article page cannot be fetched.
        """
        selector = self._fetchTree(paperUrl)
        try:
            pagerText = selector.xpath("//*[contains(concat(' ', @class, ' '), 'fenye')]")[0].text
            pageCount = pagerText.strip("\n\t").strip("()").split("/")[1]
            return int(pageCount) + 1
        except (IndexError, ValueError, AttributeError):
            # IndexError: no pager element or no "/" in its text;
            # AttributeError: pager element has no text;
            # ValueError: the page count is not a number.
            return 2

    def getNodeText(self, nodeP):
        """Return all text contained in *nodeP*, in document order.

        The node's own text comes first, then for every child the child's
        full (recursively collected) text followed by the child's tail text.

        Args:
            nodeP: an element exposing ``text``, ``tail`` and iteration over
                its children (as lxml elements do).

        Returns:
            str: the concatenated text; ``''`` when the node holds no text.
        """
        parts = [nodeP.text or '']
        for childNode in nodeP:
            # Use the recursive result so text nested at any depth is kept.
            parts.append(self.getNodeText(childNode))
            parts.append(childNode.tail or '')
        return ''.join(parts)

    def _collectParagraphs(self, paperUrl):
        """Return the text of every ``<p>`` in the article body, across all sub-pages."""
        paperContent = []
        for pageNextUrlNum in range(1, self.getPagerNextMaxNum(paperUrl)):
            # Sub-page N of ".../x.html" is requested as ".../x_N.html".
            pageNextUrl = paperUrl.replace(".html", "_{}.html".format(pageNextUrlNum))
            selector = self._fetchTree(pageNextUrl)
            paperList = selector.xpath("//*[contains(concat(' ', @class, ' '), 'article_box_info')]/p")
            for paper in paperList:
                paperContent.append(self.getNodeText(paper))
        return paperContent

    def _saveDocument(self, paperTitle, paperContent):
        """Write *paperContent* under a *paperTitle* heading to ``<title>.docx``."""
        # The link text is None when the anchor has no direct text.
        paperTitle = paperTitle or 'untitled'
        document = Document()
        document.add_heading(paperTitle, 0)
        for paperLine in paperContent:
            # Stop at this marker line; presumably it starts the site's
            # "related recommendations" footer, which is not article text.
            if paperLine == '相关推荐:':
                break
            document.add_paragraph(paperLine)
        document.save('{}.docx'.format(self._safeFileName(paperTitle)))

    def download(self):
        """Download every listed article and save it as a Word document.

        Prints one progress line per article. An article whose pages cannot
        be fetched is reported and skipped so the remaining ones still run.
        """
        paperAll = self.getPageUrl()
        for paperUrl, paperTitle in paperAll.items():
            print("[*D]{} -- {}".format(paperTitle, paperUrl))
            try:
                paperContent = self._collectParagraphs(paperUrl)
            except requests.RequestException as err:
                print("[*E]{} -- {}".format(paperUrl, err))
                continue
            self._saveDocument(paperTitle, paperContent)
# Script entry point: build the downloader and run the full crawl.
if __name__ == "__main__":
    downloader = Download()
    downloader.download()