Scraping the personal profiles and CVs of all Xidian University faculty with Python
The script was written and run under Sublime Text 3; the overall implementation is fairly simple.
To use it, paste the source as-is. You can adjust the loop-control values, or copy out a single function if you only need part of the crawl (a short single-function sketch follows the listing).
I am a third-year undergraduate interested in network security, and Xidian University is a strong school in that field, so I wanted to browse the relevant supervisors while getting some practice writing scripts.
All of the information scraped here is publicly available on Xidian University's own web pages; if anything infringes, it will be removed immediately.
The code follows:
# coding=utf-8
import re                       # regular expressions, used to match URLs in page source
import requests
from bs4 import BeautifulSoup   # BeautifulSoup for HTML parsing
import time
import random
def visit():
    # Index page that lists every school/college on the faculty site
    url = 'https://faculty.xidian.edu.cn/yxsz.jsp?urltype=tree.TreeTempUrl&wbtreeid=1020'
    content = requests.get(url)
    soup = BeautifulSoup(content.text, 'lxml')
    final_str = ''
    for x in soup.find_all('div', attrs={"class": "li-b"}):
        final_str += str(x)
    # BeautifulSoup re-encodes '&' as '&amp;'; decode it so the pattern below matches
    final_str1 = final_str.replace('&amp;', '&')
    college_url = re.findall('xyjslb.jsp.*urltype=tsites.CollegeTeacherList&wbtreeid=1020&st=0&id=.*&lang=zh_CN#collegeteacher', final_str1)
    return college_url
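# visit() returns relative links of the form
#   xyjslb.jsp?...urltype=tsites.CollegeTeacherList&wbtreeid=1020&st=0&id=...&lang=zh_CN#collegeteacher
# (one per college); main() prefixes them with https://faculty.xidian.edu.cn/ before requesting.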
def replace_special_str(line):
    # str(list) produces text like "['5']"; strip the u, [, ], | and ' characters
    for ch in line:
        if ch in "u[]|\'":
            line = line.replace(ch, "")
    return line
def visit_every_college(college_url):
    content = requests.get(college_url)
    # The pager on the teacher list reads " 1/N "; capture N, the total page count
    rule = ' 1/(.*) '
    Number_of_page = re.findall(rule, content.text)
    Number_of_page = replace_special_str(str(Number_of_page))
    Number_of_page = int(Number_of_page)
    match_rule = 'http://faculty.xidian.edu.cn/.*/zh_CN/index.htm'
    # Drop the '#collegeteacher' fragment before appending the paging parameters,
    # otherwise they would be treated as part of the fragment and never sent
    base_url = college_url.split('#')[0]
    final_content = []
    for i in range(1, Number_of_page + 1):
        aim_url = base_url + '&totalpage=' + str(Number_of_page) + '&PAGENUM=' + str(i)
        page = requests.get(aim_url)
        # Collect every teacher homepage URL found on this page of the list
        final_content += re.findall(match_rule, page.text)
    return final_content
def spide_teacher_Intro(url_i):
    result = requests.get(url_i)
    # Pause 0-3 seconds between requests to be polite to the server
    time.sleep(random.randint(0, 3))
    soup = BeautifulSoup(result.text, "lxml")
    str2 = ''
    str3 = ''
    div = soup.find("div", attrs={"class": "p_l_nr"})
    if div is None:
        str1 = 'This teacher could not be scraped; please view the page manually'
    else:
        str1 = div.text
    div1 = soup.find("div", attrs={"class": "t_jbxx_nr"})
    if div1 is not None:
        str2 = div1.text
    div2 = soup.find("div", attrs={"class": "t_grjj_nr"})
    if div2 is None:
        # Some homepages use a different template; try the alternate class
        div2 = soup.find("div", class_="p_r_nr")
    if div2 is None:
        str3 = ''
    else:
        str3 = div2.text
    final_str = str1.strip() + str2.strip() + str3
    return final_str
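# The class names above appear to map to page sections: t_jbxx_nr holds the
# basic-information block and t_grjj_nr the personal introduction, while
# p_l_nr / p_r_nr are the left / right content columns of another template.
# Teacher homepages use different themes, hence the fallbacks.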
def write_file(filename, final_content):
    # Open the output file for writing, encoded as utf-8
    f = open(filename, 'w', encoding='utf-8')
    double_teacher_num = len(final_content)
    print(final_content)
    # Each teacher's URL is matched twice on the list page, so step by 2
    for i in range(0, double_teacher_num, 2):
        this_teacher = spide_teacher_Intro(final_content[i])
        print(i)
        f.write(u'\r\n')
        link = 'Profile of teacher No. ' + str(int((i / 2) + 1)) + ':'
        f.write(link)
        f.write(u'\r\n')
        # \xa0 is a non-breaking space; replace it with a plain space before writing
        f.write(this_teacher.replace(u'\xa0', u' '))
        f.write(u'\r\n')
    f.close()
def obt_filename():
    # Fetch the same index page again and pull out the college names
    url = 'https://faculty.xidian.edu.cn/yxsz.jsp?urltype=tree.TreeTempUrl&wbtreeid=1020'
    content = requests.get(url)
    soup = BeautifulSoup(content.text, "lxml")
    second_content = ''
    for x in soup.find_all('div', attrs={"class": "li-b"}):
        second_content += str(x)
    # Each college name follows an anchor ending in 'collegeteacher">'
    rule = 'collegeteacher">(.*)'
    college_name = re.findall(rule, second_content)
    return college_name
def main():
    college_list = visit()  # First get the list of all colleges
    print('Successfully fetched the college list')
    print(college_list)
    college_num = len(college_list)  # Number of colleges found
    print('Number of colleges:')
    print(college_num)
    college_name = obt_filename()  # Get every college's name, used as the file name
    print('Successfully fetched all college names')
    print(college_name)
    # Walk the colleges one by one, collect each one's teacher URLs,
    # and write one output file per college
    for i in range(college_num - 1):
        college_url_aim = 'https://faculty.xidian.edu.cn/' + college_list[i]
        teacher_url = visit_every_college(college_url_aim)
        print(len(teacher_url))
        print(i)
        print(college_name[i])
        file_name = 'D:\\pyjy\\' + college_name[i] + '.txt'
        # Pass this college's file name and all of its teacher URLs to the writer
        write_file(file_name, teacher_url)

if __name__ == '__main__':
    main()
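As mentioned above, you can also run a single function on its own. Here is a minimal sketch for fetching one teacher's profile; the URL below is a placeholder, so substitute a real homepage of the form matched by match_rule:

# Quick single-teacher test; the URL is a placeholder, not a real teacher page
test_url = 'http://faculty.xidian.edu.cn/example/zh_CN/index.htm'  # hypothetical
print(spide_teacher_Intro(test_url))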