用beautifulsoup提取
import requests
from bs4 import BeautifulSoup
import csv
URL = "http://zsb.hitwh.edu.cn/home/major/index"
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')
# Extract data from the table
rows = soup.select('tbody tr')
data = []
for row in rows:
# Getting major name
major_name = row.select_one('.left-td') # .left-td的意思是class为left-td的标签
if major_name:
major_name = major_name.get_text(strip=True) # .get_text(strip=True)的意思是获取标签中的文本内容
# Getting category
category = row.select_one('.text-center') # .text-center的意思是class为text-center的标签
if category:
category = category.get_text(strip=True)
# Getting subject requirements
subjects = row.select('td.text-center') # td.text-center的意思是td标签中class为text-center的标签
subject_req = [subj.get_text(strip=True) for subj in subjects[1:]] if len(subjects) > 1 else [] # 这行代码的意思是获取subjects中从第二个元素开始的所有元素
# Getting college details
college_detail = row.select_one('td > a') # td > a的意思是td标签下的a标签
college_name = college_detail.get_text(strip=True) if college_detail else None
college_link = college_detail['href'] if college_detail else None
# Getting major details
major_detail = row.select_one('.right-td > a') # .right-td > a的意思是class为right-td的标签下的a标签
major_detail_name = major_detail.get_text(strip=True) if major_detail else None
major_detail_link = major_detail['href'] if major_detail else None
# Appending data to list
data.append({
"Major Name": major_name,
"Category": category,
"Subject Requirements": subject_req,
"College Name": college_name,
"College Link": college_link,
"Major Detail Name": major_detail_name,
"Major Detail Link": major_detail_link
})
with open('major_info.csv', 'w', newline='') as csvfile:
fieldnames = ['Major Name', 'Category', 'Subject Requirements', 'College Name', 'College Link', 'Major Detail Name', 'Major Detail Link']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames) # DictWriter的意思是将字典写入csv文件
writer.writeheader() # writeheader的意思是写入表头
for item in data:
writer.writerow(item)
爬出来的问题在于,学院信息那里跨行的地方没有
如果我想把空缺的地方填满该咋做?
逻辑是每一个tr放到一个row变量里面,然后爬取,那么我们每个row变量长啥样,我存到文件里康康:
rows = soup.select('tbody tr')
data = []
# 把rows[0]存到txt文件中
with open('major_info.txt', 'w', encoding='utf-8') as f:
f.write(str(rows[0]))
我们只需要把rowspan里面的值利用好就行
使用 rowspan 属性来处理在HTML表格中跨多行的字段。当字段在多行中都保持不变时,它会具有 rowspan 属性。
为了处理 rowspan,我们为每一个可能有 rowspan 的字段维护两个外部变量:一个用来存储当前值,另一个用来追踪剩余的行数。
每次循环时,我们都会检查是否还有剩余的行数。如果有,我们就使用当前的值并减少剩余的行数。否则,我们就从当前行提取新的值。
# 导入必要的库
import requests
from bs4 import BeautifulSoup
import csv
# 设置要抓取数据的网址
URL = "http://zsb.hitwh.edu.cn/home/major/index"
# 使用requests库获取网页内容
response = requests.get(URL)
# 使用BeautifulSoup解析网页内容,选择使用'html.parser'作为解析器
soup = BeautifulSoup(response.content, 'html.parser')
# 从HTML中选择所有tbody下的tr标签,这些标签代表了表格中的每一行
rows = soup.select('tbody tr')
# 初始化一个空列表来存储提取出来的数据
data = []
# 对于那些带有rowspan属性的字段,我们需要定义一些外部的变量来追踪其值和它们应该持续的行数
# 这样我们可以在接下来的行中重用这些值
# 专业名称相关变量
remaining_rows_major_name = 0
current_major_name = None
# 类别相关变量
remaining_rows_category = 0
current_category = None
# 必修课程相关变量
remaining_rows_subjects = 0
current_subjects = None
# 学院详情相关变量
remaining_rows_college_detail = 0
current_college_name = None
current_college_link = None
# 遍历每一行,从中提取所需的信息
for row in rows:
# 以下部分的逻辑是为了处理带有rowspan属性的字段
# 如果一个字段的值在多行中都是相同的(由rowspan属性决定),我们只在第一行中提取它
# 然后在接下来的行中重用这个值,直到达到rowspan指定的行数
# 获取专业名称
if remaining_rows_major_name > 0:
major_name = current_major_name
remaining_rows_major_name -= 1
else:
major_name_ele = row.select_one('.left-td')
if major_name_ele:
major_name = major_name_ele.get_text(strip=True)
current_major_name = major_name
if 'rowspan' in major_name_ele.attrs:
remaining_rows_major_name = int(major_name_ele['rowspan']) - 1
# 获取类别
if remaining_rows_category > 0:
category = current_category
remaining_rows_category -= 1
else:
category_ele = row.select_one('.text-center')
if category_ele:
category = category_ele.get_text(strip=True)
current_category = category
if 'rowspan' in category_ele.attrs:
remaining_rows_category = int(category_ele['rowspan']) - 1
# 获取必修课程
if remaining_rows_subjects > 0:
subject_req = current_subjects
remaining_rows_subjects -= 1
else:
subjects = row.select('td.text-center')
subject_req = [subj.get_text(strip=True) for subj in subjects[1:]] if len(subjects) > 1 else []
current_subjects = subject_req
if subjects and 'rowspan' in subjects[0].attrs:
remaining_rows_subjects = int(subjects[0]['rowspan']) - 1
# 获取学院详情
if remaining_rows_college_detail > 0:
college_name = current_college_name
college_link = current_college_link
remaining_rows_college_detail -= 1
else:
college_detail = row.select_one('td[rowspan] > a')
if college_detail:
college_name = college_detail.get_text(strip=True)
college_link = college_detail['href']
current_college_name = college_name
current_college_link = college_link
if 'rowspan' in college_detail.find_parent().attrs:
remaining_rows_college_detail = int(college_detail.find_parent()['rowspan']) - 1
# 获取专业详情名称和链接
major_detail = row.select_one('.right-td > a')
major_detail_name = major_detail.get_text(strip=True) if major_detail else None
major_detail_link = major_detail['href'] if major_detail else None
# 将提取的数据保存为一个字典,并将其添加到data列表中
data.append({
"Major Name": major_name,
"Category": category,
"Subject Requirements": subject_req,
"College Name": college_name,
"College Link": college_link,
"Major Detail Name": major_detail_name,
"Major Detail Link": major_detail_link
})
# 使用csv模块将data列表写入CSV文件
with open('major_info3.csv', 'w', newline='') as csvfile:
fieldnames = ['Major Name', 'Category', 'Subject Requirements',
'College Name', 'College Link', 'Major Detail Name', 'Major Detail Link']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for item in data:
writer.writerow(item)
逻辑
这样的话,第一次遇到有rowspan属性,不仅提取了内容(get-text),还记下了循环次数 也就是上图中圈出来的地方,接下来rowspan-1次循环,每循环一次,减少一次remaining次数,并且赋值给major_name