程序代码:
from urllib.request import urlopen
from re import findall
import requests
# Download the index page and build ``end``: the absolute URLs of the linked
# pages that the rest of the script visits one by one.
url = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
with urlopen(url) as file:
    content = file.read().decode()

# Every href value on the page is a candidate link.
pattern = 'href="(.+?)"'
result = findall(pattern, content)

# Keep an href only when the three characters before its last one are "htm"
# (e.g. a path ending in ".html") and characters 15-19 spell "colys".
# The hrefs are site-relative, so the host is prepended.
end = [
    'https://www.cae.cn' + link
    for link in result
    if link[-4:-1] == "htm" and link[15:20] == "colys"
]
# Regular expressions applied to each downloaded page by the loop that follows.
# pattern_jpg captures the src value of an <img> tag whose inline style is
# exactly width:150px;height:210px.
pattern_jpg = 'img src="(.+?)" style="width:150px;height:210px;"'
# NOTE(review): this literal is split across lines and contains no markup, so
# it is not valid Python as written. The HTML tags it was meant to match appear
# to have been lost from the text - restore them from the original script.
# It has two capture groups, so findall yields 2-tuples for it.
pattern_message = ' (.+?)
(.+?)
'
# NOTE(review): as written this matches any run of characters; presumably the
# surrounding tags were lost here as well - confirm against the original.
pattern_name = '(.+?)'
# Visit every page collected in ``end`` and write two files per page, both
# named after the text matched by ``pattern_name``:
#   <name>.txt - the text matched by ``pattern_message`` (UTF-8)
#   <name>.jpg - the image whose path is matched by ``pattern_jpg``
for urls1 in end:
    with urlopen(urls1) as file:
        content1 = file.read().decode()

    result_jpg = findall(pattern_jpg, content1)
    result_message = findall(pattern_message, content1)
    result_name = findall(pattern_name, content1)

    # Without this guard a page with a missing match would silently reuse the
    # values left over from the previous page (or raise NameError on the
    # first page), producing files with the wrong name or content.
    if not (result_jpg and result_message and result_name):
        continue

    # Only the last match of each pattern is used. "".join flattens the
    # tuple that findall returns for multi-group patterns and leaves a plain
    # string unchanged.
    result_jpg_str = "".join(result_jpg[-1])
    result_message_str = "".join(result_message[-1])
    result_name_str = "".join(result_name[-1])

    # The with-statement closes the file; no explicit close() is needed.
    with open(result_name_str + '.txt', 'w', encoding="utf-8") as file_message:
        file_message.write(result_message_str)

    # Download before opening the output file so that a failed request does
    # not leave an empty or bogus .jpg behind.
    url_new = "https://www.cae.cn/" + result_jpg_str
    r = requests.get(url_new, timeout=30)
    if not r.ok:
        continue
    with open(result_name_str + '.jpg', 'wb') as file_jpg:
        file_jpg.write(r.content)
程序代码:
from urllib.request import urlopen
from re import findall
import openpyxl
from openpyxl import Workbook
# Create the output workbook with a header row, write it to disk, then reopen
# the saved file so the code that follows works on the reloaded workbook.
fn = r'D:\message.xlsx'
wb = Workbook()
ws = wb.create_sheet(title="工程院士信息")

# Header row: one column per field recorded for each person.
header_cells = (
    ('A1', '姓名'),
    ('B1', '性别'),
    ('C1', '民族'),
    ('D1', '毕业院校'),
    ('E1', '入选年份'),
)
for cell_ref, heading in header_cells:
    ws[cell_ref] = heading

wb.save(fn)
wb = openpyxl.load_workbook(fn)
# A new Workbook already holds one default sheet, so the sheet created above
# is the second one (index 1).
ws = wb.worksheets[1]
# Download the index page and build ``end``: the absolute URLs of the linked
# pages that the rest of the script visits one by one.
url = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
with urlopen(url) as file:
    content = file.read().decode()

# Every href value on the page is a candidate link.
pattern = 'href="(.+?)"'
result = findall(pattern, content)

# Keep an href only when the three characters before its last one are "htm"
# (e.g. a path ending in ".html") and characters 15-19 spell "colys".
# The hrefs are site-relative, so the host is prepended.
end = [
    'https://www.cae.cn' + link
    for link in result
    if link[-4:-1] == "htm" and link[15:20] == "colys"
]
# NOTE(review): this literal (two adjacent strings joined by a line
# continuation) is split across lines and contains no markup, so it is not
# valid Python as written. The HTML tags it was meant to match appear to have
# been lost from the text - restore them from the original script.
pattern_message_year = ' (.+?)
(.+?)
' \
'(
(.+?)
)*'
# NOTE(review): this literal is also split across lines with no markup left;
# presumably its tags were lost too - confirm against the original.
pattern_message = ' (.+?)
(.+?)
'
# NOTE(review): an empty pattern makes findall return only empty strings, so
# the first result used by the page loop would be ''. Presumably the real
# pattern was lost from the text - confirm against the original.
pattern_next_url = ''
# Counters and scratch state for the page loop that follows. Only part of that
# loop is visible here, so how most of these are used cannot be confirmed.
count = 1
# The page loop stops once this reaches 41.
# NOTE(review): starts at 2, presumably because row 1 holds the headers - confirm.
number = 2
compare = []
count_year = 0
number_year = 2
count_number = 2
for url_new in end:
if number == 41:
break
with urlopen(url_new) as file:
content1 = file.read().decode()
result_new_url = findall(pattern_next_url, content1)
with urlopen(result_new_url[0]) as file_enter:
file_enter_url = file_enter.read().decode()
pattern_name = '(:|:)(.+?)(