In today's era of information overload, collecting and analyzing data matters more than ever. As a data analyst, I often need to scrape data from websites for my research. Recently I took on a project that required scraping data from the ShanghaiRanking (上海软科) site, an authoritative publisher of university rankings; the pages targeted here are its 2023 subject rankings. This article walks through how I wrote a Python crawler to automate that collection.
Before writing any code, we need to prepare the following environment and libraries:
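Judging from the imports in the script below, the third-party packages involved are requests, lxml, and fake_useragent (published on PyPI as fake-useragent); csv, time, queue, threading, and logging all ship with Python 3. Assuming a standard Python 3 setup with pip available, they can be installed with:

pip install requests lxml fake-useragent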
Our target URL is the ShanghaiRanking site: https://www.shanghairanking.cn/rankings/bcsr/2023. From this page we need to collect the ranking information for every subject.
By analyzing the page's HTML structure, we identified the tags and class names that hold the data. Each subject's ranking page has its own URL, and these URLs can be collected from the index page.
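To make that structure concrete, here is a rough, hand-written fragment shaped like what the XPaths in the scraper expect (the markup and the sample values 07 / 理学 / 0701 / 数学 are illustrative assumptions, not copied from the live site), together with a minimal lxml extraction sketch:

from lxml import html

# Hypothetical fragment mimicking the structure the scraper's XPaths assume
sample = '''
<div class="subject-item">
  <span class="subject-code">07</span>
  <div class="subject-title">理学</div>
  <div class="subject-list">
    <a href="/rankings/bcsr/2023/0701"><span>0701</span><span>数学</span></a>
  </div>
</div>
'''

tree = html.fromstring(sample.strip())
for subject in tree.xpath('//div[@class="subject-item"]'):
    code = subject.xpath('.//span[@class="subject-code"]/text()')[0].strip()
    title = subject.xpath('.//div[@class="subject-title"]/text()')[0].strip()
    print(code, title)  # -> 07 理学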
To speed up the crawl, the scraper uses a multi-threaded design: each thread fetches the ranking data for one subject and puts the result into a shared queue.
import requests
from lxml import html
import csv
import time
from queue import Queue
from fake_useragent import UserAgent
import threading
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
base_url = "https://www.shanghairanking.cn/rankings/bcsr/2023"
headers = {
'User-Agent': UserAgent().random
}
def fetch_subjects_list():
    # Fetch the index page and build a list of all sub-subjects with their URLs
    session = requests.Session()
    num_retries = 3
    while num_retries > 0:
        try:
            response = session.get(base_url, headers=headers, timeout=10)
            response.raise_for_status()
            tree = html.fromstring(response.content)
            subject_list = []
            subject_items = tree.xpath('//div[@class="subject-item"]')
            for subject in subject_items:
                cip_code = subject.xpath('.//span[@class="subject-code"]/text()')[0].strip()
                cip_title = subject.xpath('.//div[@class="subject-title"]/text()')[0].strip()
                cip_full_title = f'{cip_code}{cip_title}'
                sub_items = subject.xpath('.//div[@class="subject-list"]//a')
                for item in sub_items:
                    sub_code = item.xpath('.//span[1]/text()')[0].strip()
                    sub_title = item.xpath('.//span[2]/text()')[0].strip()
                    sub_full_title = f'{sub_code}{sub_title}'
                    sub_url = f"{base_url}/{sub_code}"
                    subject_list.append({
                        'cip_full_title': cip_full_title,
                        'sub_full_title': sub_full_title,
                        'sub_url': sub_url
                    })
            logging.info("Successfully fetched the index page")
            return subject_list
        except requests.RequestException as e:
            logging.error(f"Error while fetching the index page: {e}")
            num_retries -= 1
            time.sleep(5)  # wait 5 seconds before retrying
    logging.error("Failed to fetch the index page after multiple attempts")
    raise Exception("Failed to fetch the index page after multiple attempts")
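# For reference, each entry returned by fetch_subjects_list() has this shape
# (the values below are illustrative, not taken from the live site):
#   {'cip_full_title': '07理学',
#    'sub_full_title': '0701数学',
#    'sub_url': 'https://www.shanghairanking.cn/rankings/bcsr/2023/0701'}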
def extract_data_from_details_page(details_html_content):
    # Parse the table rows on a subject's details page
    tree = html.fromstring(details_html_content)
    rows = tree.xpath('//tr')
    data = []
    for row in rows:
        try:
            rank_2023 = row.xpath('.//td/div[@class="ranking"]/text()')
            rank_2022 = row.xpath('.//td/span[@data-v-6c038bb7=""]/text()')
            level = row.xpath('.//td[contains(text(), "前")]/text()')
            u_name = row.xpath('.//td//span[@class="name-cn"]/text()')
            logo = row.xpath('.//td//div[@class="logo"]/img/@src')
            grade = row.xpath('.//td[last()]/text()')  # the last column holds the total score
            if not u_name:
                continue
            data.append({
                '2023年排名': rank_2023[0].strip() if rank_2023 else 'N/A',
                '2022年排名': rank_2022[0].strip() if rank_2022 else 'N/A',
                '全部层次': level[0].strip() if level else 'N/A',
                '大学名称': u_name[0].strip() if u_name else 'N/A',
                'Logo链接': logo[0].strip() if logo else 'N/A',
                '总分': grade[0].strip() if grade else 'N/A'
            })
        except Exception as e:
            logging.error(f"Error while extracting data from a row: {e}")
            continue
    logging.info("Successfully extracted data from the details page")
    return data
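# For reference, each row produced above looks roughly like this
# (the values are illustrative placeholders):
#   {'2023年排名': '1', '2022年排名': '2', '全部层次': '前1%',
#    '大学名称': '某大学', 'Logo链接': 'https://.../logo.png', '总分': '100.0'}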
# Save results to a CSV file
def save_to_csv(data, filename):
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['序号', '一级分类', '二级分类', '2023年排名', '2022年排名', '全部层次', '大学名称', 'Logo链接', '总分']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # Write the header only if the file is still empty
        csvfile.seek(0, 2)
        if csvfile.tell() == 0:
            writer.writeheader()
        for index, row in enumerate(data, start=1):
            row['序号'] = index
            writer.writerow(row)
    logging.info(f"Data saved to {filename}")
def fetch_and_process_subject(subject, queue):
    sub_title = subject['sub_full_title']
    sub_url = subject['sub_url']
    cip_full_title = subject['cip_full_title']  # first-level category
    logging.info(f"Fetching data for {sub_title}")
    session = requests.Session()
    num_retries = 3
    while num_retries > 0:
        try:
            # Reuse the randomized User-Agent header for the details pages as well
            response = session.get(sub_url, headers=headers, timeout=10)
            response.raise_for_status()
            details_html_content = response.content
            # Extract the ranking rows
            university_data = extract_data_from_details_page(details_html_content)
            for data in university_data:
                data['一级分类'] = cip_full_title  # attach the first-level category
                data['二级分类'] = sub_title  # attach the second-level category
            queue.put(university_data)
            logging.info(f"Successfully fetched data for {sub_title}")
            return  # success, stop retrying
        except requests.RequestException as e:
            logging.error(f"Error while fetching the details page {sub_url}: {e}")
            num_retries -= 1
            time.sleep(5)  # wait 5 seconds before retrying
    logging.error(f"Failed to fetch {sub_url}")
# Worker thread: one thread per subject
class FetchSubjectThread(threading.Thread):
    def __init__(self, subject, queue):
        threading.Thread.__init__(self)
        self.subject = subject
        self.queue = queue

    def run(self):
        fetch_and_process_subject(self.subject, self.queue)
# Main scraping logic
def main():
    subjects = fetch_subjects_list()
    # Ask the user which first-level category to scrape
    cip_title_filter = input("Enter the name of the first-level category: ")
    filtered_subjects = [subject for subject in subjects if cip_title_filter in subject['cip_full_title']]
    if not filtered_subjects:
        logging.error("No matching first-level category found; please check the name you entered.")
        return
    queue = Queue()
    threads = []
    for subject in filtered_subjects:
        thread = FetchSubjectThread(subject, queue)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    while not queue.empty():
        data = queue.get()
        if not data:  # skip subjects that returned no rows
            continue
        filename = f"{data[0]['一级分类']}.csv"  # all rows in this batch share the same first-level category
        save_to_csv(data, filename)

if __name__ == "__main__":
    main()
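To try the script, save it to a file (for example ruanke_spider.py, a name chosen here purely for illustration), run python ruanke_spider.py, and type part of a first-level category name at the prompt. Each matching sub-subject is then scraped in its own thread, and the rows are appended to a CSV file named after the first-level category, e.g. 07理学.csv.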
When building scrapers, it is important to respect the target site's rules as well as applicable laws and regulations. I hope this article is helpful to readers interested in web scraping.