USVN获取所有项目及项目信息-python爬虫

# coding:utf-8
import re,csv
import requests

# Address of the USVN "Admin -> Projects" page (the URL shown in the browser
# after opening that page). Change this to match your own server.
projects_url = 'http://svnpub.xurikeji.com:8081/usvn1/admin/project'
# Base address of a single project's page: everything before "admin" in the
# URL above, followed by "project/".
project_url = projects_url.partition("admin")[0] + "project/"

# Headers sent with every request. The Cookie value carries a PHPSESSID
# session id and will most likely need to be replaced with one from your own
# logged-in browser session.
# NOTE(review): avoid committing a real session cookie to version control.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    ),
    'Cookie': (
        '_ga=GA1.1.1852336263.1700719769; '
        '_ga_0C4M1PWYZ7=GS1.1.1700719769.1.0.1700719810.0.0.0; '
        '_ga_T11SF3WXX2=GS1.1.1700719790.1.0.1700719810.40.0.0; '
        '_ga_K2SPJK2C73=GS1.1.1700719790.1.0.1700719810.40.0.0; '
        'PHPSESSID=ipqe60ge4pjnfbqdahe0nj2oh2'
    ),
}
# CSV file that receives the results, opened in append mode so repeated runs
# add rows instead of overwriting. The path can be left as it is.
file = open('C:\\Users\\xurikeji\\Desktop\\my_list.csv', mode='a', newline='')

def project():
    """Fetch the USVN project list page and export every project found.

    Sends a GET request to ``projects_url``, scans the returned HTML for
    project name / description pairs and hands each pair to ``info``.
    The module-level output ``file`` is closed when this function exits,
    whether it finishes normally or raises.
    """
    try:
        # Download the project list page.
        response = requests.get(projects_url, headers=headers)
        html = response.text

        # NOTE(review): this pattern looks garbled. ``(?P.*?)`` is not valid
        # regex syntax, so ``re.compile`` raises ``re.error`` on this line.
        # The groups were presumably ``(?P<name>...)`` and
        # ``(?P<describe>...)`` (the names read below), separated by the HTML
        # markup of the project table. Restore the full pattern from the real
        # page source; adding only the group names would match the empty
        # string at every position.
        project_name_desc_re = re.compile(r'(?P.*?).*?(?P.*?)', re.S)

        # Look up and export the details of each project on the page.
        for match in project_name_desc_re.finditer(html):
            info(match.group('name'), match.group('describe'))
    finally:
        # Close the CSV even when a request, the regex or info() fails, so
        # the handle is not leaked and rows already written reach the disk.
        file.close()

def info(project_name,project_describe):
    """Fetch one project's page, print its details and append them to the CSV.

    Args:
        project_name: Project name; appended to ``project_url`` to build the
            page address.
        project_describe: Project description, copied to the output as-is.

    Returns:
        None. When the server answers with HTTP 500 nothing is written.
    """
    # Build the address of this project's page and download it.
    url = project_url + project_name
    response = requests.get(url, headers=headers)
    # Per the original author's note, a 500 here means the project's
    # underlying repository has already been deleted, so skip it.
    if response.status_code == 500:
        print("GET请求返回状态码为500,跳出函数")
        return
    html = response.text

    # Administrators and user groups are taken from the confirmation
    # messages embedded in the page. The group names ``admin`` and ``user``
    # are the ones read back below.
    project_admins_re = re.compile(r'您确定要取消用户 (?P<admin>.*?) 对于项目 .*? 的管理权\?', re.S)
    project_users_re = re.compile(r'您确定要将用户组 (?P<user>.*?) 从项目 .*? 中删除\?', re.S)
    # NOTE(review): this pattern looks garbled. ``(?P.*?)`` is not valid
    # regex syntax, so ``re.compile`` raises ``re.error`` on this line. It
    # presumably held four named groups separated by the HTML markup of the
    # "recent commit" section; restore it from the real page source.
    recently_commit_info_re = re.compile(r'(?P.*?).*?(?P.*?).*?(?P.*?).*?(?P.*?)', re.S)

    # Collapse every match into one comma-separated string per column.
    admins = ','.join(m.group('admin') for m in project_admins_re.finditer(html))
    users = ','.join(m.group('user') for m in project_users_re.finditer(html))

    # Only the first commit match is reported; fall back to a placeholder
    # when the page has none.
    recently_commit_info = recently_commit_info_re.findall(html)
    if recently_commit_info and recently_commit_info[0]:
        recently_commit_infos = ','.join(recently_commit_info[0])
    else:
        recently_commit_infos = '无'

    # Echo the collected details to the console.
    print("项目名:",project_name)
    print("项目描述:",project_describe)
    print("项目管理员:",admins)
    print("项目用户组:",users)
    print("信息:",recently_commit_infos)
    print("======================================================")

    # Append one row for this project to the shared CSV file.
    writer = csv.writer(file)
    writer.writerow([project_name,project_describe,admins,users,recently_commit_infos])

# Run the export only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    project()

你可能感兴趣的:(python,爬虫,开发语言)