Python经典基础习题(网络爬虫)

1.批量爬取院士信息,把每位院士的文字介绍保存到以该院士名字命名的记事本文件中,照片保存到以该院士名字命名的jpg文件中。

程序代码:

from urllib.request import urlopen
from re import findall
import requests
# Download the index page and decode it with the default codec.
url = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
with urlopen(url) as index_page:
    content = index_page.read().decode()

# Collect the value of every href attribute found in the markup.
pattern = 'href="(.+?)"'
result = findall(pattern, content)

# Keep a link when the three characters before its last one are "htm" and
# characters 15-19 are "colys"; the kept links are prefixed with the site root.
end = [
    'https://www.cae.cn' + link
    for link in result
    if link[-4:-1] == "htm" and link[15:20] == "colys"
]
# Regexes applied to every academician page collected in `end`.
pattern_jpg = 'img src="(.+?)" style="width:150px;height:210px;"'
# NOTE(review): in the published listing the next two literals contain only
# whitespace and capture groups, so the HTML tags they were anchored on appear
# to have been stripped. The text is reproduced here as published (the lone
# blank between the two groups may originally have been another character --
# confirm); re-anchor both patterns on the real page markup before running.
pattern_message = '\n\n    (.+?)\n\n \n\n    (.+?)\n\n'
pattern_name = '\n(.+?)\n'

# For each page: save the introduction to "<name>.txt" and the portrait to
# "<name>.jpg" in the current working directory.
for urls1 in end:
    with urlopen(urls1, timeout=30) as file:
        content1 = file.read().decode()

    # The last match on the page wins. A page without a name is skipped, so
    # it can never inherit the name, text or image of the previous page.
    result_name = findall(pattern_name, content1)
    if not result_name:
        continue
    result_name_str = result_name[-1]

    # Each match is a tuple of the two captured paragraphs; join them.
    result_message = findall(pattern_message, content1)
    if result_message:
        result_message_str = "".join(result_message[-1])
        with open(result_name_str + '.txt', 'w', encoding="utf-8") as file_message:
            file_message.write(result_message_str)

    result_jpg = findall(pattern_jpg, content1)
    if result_jpg:
        # Drop a leading "/" so the joined URL has no double slash.
        url_new = "https://www.cae.cn/" + result_jpg[-1].lstrip("/")
        # Download before opening the file so a failed request does not
        # leave an empty or bogus .jpg behind.
        r = requests.get(url_new, timeout=30)
        if r.ok:
            with open(result_name_str + '.jpg', 'wb') as file_jpg:
                file_jpg.write(r.content)


2.根据院士名单,爬取该院士性别,族别信息;根据院士简介提取该院士就读本科学校,入选院士年份;将院士姓名,性别,族别信息,本科学校,入选院士年份信息写入excel文件。

程序代码:

from urllib.request import urlopen
from re import findall
import  openpyxl
from openpyxl import Workbook
# Create the output workbook and the sheet that receives the crawl results.
fn = r'D:\message.xlsx'
wb = Workbook()
ws = wb.create_sheet(title="工程院士信息")

# Header row, columns A-E: name, gender, ethnicity, school, election year.
headers = ['姓名', '性别', '民族', '毕业院校', '入选年份']
for column, title in enumerate(headers, start=1):
    ws.cell(row=1, column=column, value=title)

# Save once up front so an unwritable path fails before any crawling starts.
# `wb` and `ws` are used directly afterwards; the file is not reloaded and the
# sheet is not looked up again by position.
wb.save(fn)
# Fetch the index page and decode the response body with the default codec.
url = 'https://www.cae.cn/cae/html/main/col48/column_48_1.html'
with urlopen(url) as listing:
    content = listing.read().decode()

# Every href attribute value on the page is a candidate link.
pattern = 'href="(.+?)"'
result = findall(pattern, content)

# A candidate is kept only if both slice tests pass; kept links are made
# absolute by prepending the site root.
end = []
for href in result:
    tail_is_htm = href[-4:-1] == "htm"
    has_colys = href[15:20] == "colys"
    if not (tail_is_htm and has_colys):
        continue
    end.append('https://www.cae.cn' + href)
# Regexes applied to each academician page and to the page it links to.
# NOTE(review): in the published listing these literals contain only
# whitespace, punctuation and capture groups, so the HTML tags they were
# anchored on appear to have been stripped. The text is reproduced as
# published; re-anchor the patterns on the real page markup before running.
pattern_message_year = (
    '\n\n    (.+?)\n\n \n\n    (.+?)\n\n'
    '(\n\n \n\n    (.+?)\n\n)*'
)
pattern_message = '\n\n    (.+?)\n\n \n\n    (.+?)\n\n'
# NOTE(review): empty as published. With an empty pattern findall() returns
# empty strings, so the second urlopen() below would be handed '' -- restore
# the link pattern first.
pattern_next_url = ''
pattern_name = '(:\n\n|:)(.+?)(\n\n|\n)'
pattern_study = '毕业于(.+?)大学'
pattern_enter = r'(\d{4})年当选'

# Rows 2..40 are filled, one row per academician. Deriving the row from the
# page index keeps all five columns of one person on the same row even when a
# page has no school or no election year.
last_row = 40
for row, url_new in enumerate(end, start=2):
    if row > last_row:
        break

    with urlopen(url_new) as file:
        content1 = file.read().decode()

    # Follow the first link matched on the page and read the linked page.
    result_new_url = findall(pattern_next_url, content1)
    with urlopen(result_new_url[0]) as file_enter:
        file_enter_url = file_enter.read().decode()

    # The middle group of the first three matches fills columns 1, 3 and 2,
    # in that order. Fewer than three matches raises IndexError.
    message = findall(pattern_name, file_enter_url)
    ws.cell(row=row, column=1, value=message[0][1])
    ws.cell(row=row, column=3, value=message[1][1])
    ws.cell(row=row, column=2, value=message[2][1])

    # School: first "毕业于…大学" hit whose full name is 4-9 characters long.
    # The length filter rejects captures that ran on to a distant "大学".
    candidates = (
        stem + "大学"
        for paragraphs in findall(pattern_message, content1)
        for paragraph in paragraphs
        for stem in findall(pattern_study, paragraph)
    )
    school = next((name for name in candidates if len(name) in range(4, 10)), None)
    if school is not None:
        ws.cell(row=row, column=4, value=school)

    # Election year: first four-digit year followed by "年当选" in the groups
    # of the first introduction match. The nested group repeats text already
    # captured by its inner group, so only the first hit is used.
    result_message_year = findall(pattern_message_year, content1)
    if result_message_year:
        years = [
            year
            for paragraph in result_message_year[0]
            for year in findall(pattern_enter, paragraph)
        ]
        if years:
            ws.cell(row=row, column=5, value=years[0])

    # Save after every page so finished rows survive a later failure.
    wb.save(fn)

你可能感兴趣的:(Python,python,爬虫)