从零搭建本地pypi镜像源2:下载清华pypi镜像源

        在博客从零搭建本地pypi镜像源1:快速体验_wzg2016的博客-CSDN博客中,快速体验了pypi镜像源的搭建过程。本博客在前一个博客的基础之上,着重介绍如何下载整个清华pypi镜像源。

1. 轮子函数--爬虫函数

        我写了几个小函数,从清华pypi镜像源中下载所有的pypi 包。

  util_spider.py

import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import csv 
import os

class Class_Util_Spider():
    """Crawler helpers used to mirror the Tsinghua PyPI index.

    NOTE(review): the HTML patterns in the published version of this class
    were lost (empty regex, undefined ``package_names``). The anchor pattern
    below is a reconstruction that assumes the index pages consist of plain
    ``<a href="...">text</a>`` anchors with relative hrefs -- confirm against
    the live mirror before a full crawl.
    """

    # Captures (href, link text) for every anchor. Leading "../" segments of
    # the href are skipped so the captured path is relative to the site root.
    _ANCHOR_RE = re.compile(
        r'<a\s[^>]*?href="(?:\.\./)*([^"]*)"[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )

    def __init__(self):
        pass

    @staticmethod
    def _extract_anchors(html_text):
        """Return a list of ``(href, text)`` tuples for the anchors in *html_text*."""
        return [
            (href, text.strip())
            for href, text in Class_Util_Spider._ANCHOR_RE.findall(html_text)
        ]

    # Child links
    def get_child_links(self, base_url):
        """Collect the URLs of all links found on the page at *base_url*.

        Args:
            base_url: address of the index page; each href found on the page
                is appended to it.

        Returns:
            A list with one absolute URL per link on the page.
        """
        html_text = requests.request("GET", base_url).text
        # A trailing slash on base_url would otherwise yield "//" in the result.
        prefix = base_url.rstrip("/") + "/"
        return [
            prefix + href
            for href, _text in Class_Util_Spider._extract_anchors(html_text)
        ]

    # Grandchild links
    def get_grandson_links_names(self, url_path_list, text_path):
        """Write the name and download link of every file listed on the given pages.

        Args:
            url_path_list: list of page URLs, one per package folder.
            text_path: path of the text file to (over)write.

        Output:
            One line per file in the form ``name,,,link`` where ``name`` is
            the link text and ``link`` is the href prefixed with the mirror
            host.
        """
        total = len(url_path_list)
        # The context manager closes the listing even if a request fails midway.
        with open(text_path, "w") as file:
            for i, url_path in enumerate(url_path_list):
                print(f"{i}/{total}", url_path)
                html_text = requests.request("GET", url=url_path).text
                for link, name in Class_Util_Spider._extract_anchors(html_text):
                    link = "https://pypi.tuna.tsinghua.edu.cn/" + link
                    file.write(name + ",,," + link + "\n")

    # Read a text file into a list with one entry per line.
    def read_txt_2_list(self, txt_path):
        """Return the lines of *txt_path* with the line breaks removed.

        Raises:
            AssertionError: if *txt_path* does not exist.
        """
        assert os.path.exists(txt_path)
        with open(txt_path, "r") as txt_file:
            return [line.strip("\n") for line in txt_file]

    # Download one file
    def download_file(self, save_dir, save_name, download_address, save_dir_his,
                      timeout=None):
        """Download *download_address* to ``save_dir/save_name`` unless it already exists.

        Args:
            save_dir: existing directory that receives the file.
            save_name: file name to store the download under.
            download_address: URL to fetch.
            save_dir_his: second directory that is checked for an existing
                copy of *save_name* before downloading.
            timeout: optional timeout in seconds passed to ``requests.get``;
                ``None`` (the default) waits indefinitely as before.

        Returns:
            True if the file is present afterwards (downloaded now or found
            in either directory), False if the server answered with an error
            status.

        Raises:
            AssertionError: if *save_dir* does not exist.
        """
        assert os.path.exists(save_dir)

        save_path = os.path.join(save_dir, save_name)
        save_path_old = os.path.join(save_dir_his, save_name)
        if os.path.exists(save_path) or os.path.exists(save_path_old):
            print(f"{save_name} Existed!")
            return True

        response = requests.get(download_address, timeout=timeout)
        if not response.ok:
            # Do not store an HTTP error page under the package's file name.
            print(f"Failed to download {save_name}: HTTP {response.status_code}")
            return False

        # Write to a temporary name first: an interrupted run must not leave a
        # truncated file that the existence check above would accept next time.
        tmp_path = save_path + ".part"
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        os.replace(tmp_path, save_path)
        print(f"Success downloaded {save_name}")
        return True

2. 轮子函数--文件夹处理函数

  util_folder.py

import os
import shutil
import glob
import numpy as np

class Class_Util_Folder():
    """File and folder helpers used by the download and clean-up scripts."""

    def __init__(self):
        pass

    def get_all_folders(self, base_dir):
        """Return the paths of all folders below *base_dir*, at any depth."""
        return [
            os.path.join(root, dir_name)
            for root, dir_names, _file_names in os.walk(base_dir)
            for dir_name in dir_names
        ]

    def get_all_files(self, base_dir):
        """Return the paths of all files below *base_dir*, at any depth."""
        return [
            os.path.join(root, file_name)
            for root, _dir_names, file_names in os.walk(base_dir)
            for file_name in file_names
        ]

    def remove_empty_folder(self, folder_path):
        """Delete *folder_path* if it contains no entries."""
        if not os.listdir(folder_path):
            os.rmdir(folder_path)

    def get_file_lists(self, fold_path):
        """List the direct entries of *fold_path*, sorted by path.

        Returns:
            A tuple ``(paths, names)`` of two parallel lists of ``str``.
        """
        # Plain string sort; no need to round-trip through a numpy array.
        file_path_list = sorted(glob.glob(fold_path + "/*"))
        file_name_list = [os.path.split(path)[-1] for path in file_path_list]
        return file_path_list, file_name_list

    def get_file_name_list(self, folder_path, txt_path):
        """Write the names of the direct entries of *folder_path* to *txt_path*."""
        _file_path_list, file_name_list = self.get_file_lists(folder_path)
        self.write_list_2_txt(file_name_list, txt_path)

    def write_list_2_txt(self, data_list, txt_path):
        """Write each string of *data_list* to *txt_path*, one per line.

        Raises:
            AssertionError: if *data_list* is empty or contains a non-string.
        """
        assert len(data_list) > 0
        # The context manager closes the file even when the type check fails
        # part-way through the list.
        with open(txt_path, "w") as txt_file:
            for data in data_list:
                assert isinstance(data, str)
                txt_file.write(data + "\n")

    def read_txt_2_list(self, txt_path):
        """Return the lines of *txt_path* with the line breaks removed.

        Raises:
            AssertionError: if *txt_path* does not exist.
        """
        assert os.path.exists(txt_path)
        with open(txt_path, "r") as txt_file:
            return [line.strip("\n") for line in txt_file]

    def move_file(self, src_file, aim_file):
        """Move *src_file* to *aim_file*; drop the source if the target exists.

        Raises:
            AssertionError: if *src_file* does not exist or the two paths do
                not end in the same file name.
        """
        assert os.path.exists(src_file)
        # os.path.basename understands the platform's separators; splitting on
        # "/" compared whole paths when they were written with backslashes.
        assert os.path.basename(src_file) == os.path.basename(aim_file)
        if not os.path.exists(aim_file):
            shutil.move(src_file, aim_file)
        else:
            os.remove(src_file)

    def get_files_from_grandpa_fold(self, base_dir):
        """Collect the files located exactly two levels below *base_dir*.

        Returns:
            A tuple ``(paths, names)`` of two parallel lists.
        """
        file_path_list = [
            path for path in glob.glob(base_dir + "/*/*") if os.path.isfile(path)
        ]
        file_name_list = [os.path.split(path)[-1] for path in file_path_list]
        return file_path_list, file_name_list

    def print_progress(self, current_num, total_num):
        """Print the progress as ``<percent>%,<current>//<total>``."""
        # An empty work list must not crash the progress report.
        percent = current_num / total_num * 100 if total_num else 0.0
        print(f"{round(percent,2)}%,{current_num}//{total_num}")

3. 步骤一:获取所有pypi包的下载链接

  step1_get_requirements.py

import os 
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from util.util_spider import Class_Util_Spider

# Root page of the mirror's index and the file that receives the listing.
search_url_base = "https://pypi.tuna.tsinghua.edu.cn/simple"
requirements_txt = "./pypi_requirements_all.txt"

util_spider_obj = Class_Util_Spider()

# Build the listing only when it is not on disk yet: first collect the links
# found on the index page, then write the name and download link of every
# file found behind those links into requirements_txt.
if not os.path.exists(requirements_txt):
    child_links = util_spider_obj.get_child_links(search_url_base)
    util_spider_obj.get_grandson_links_names(child_links, requirements_txt)

4. 步骤二:下载pypi包到本地(截至2022-12-12,约需要12T的存储空间)

  step2_download.py 在linux中使用

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from util.util_folder import Class_Util_Folder
from util.util_spider import Class_Util_Spider

util_folder_obj = Class_Util_Folder()
util_spider_obj = Class_Util_Spider()

download_dir_base = "/home/wzg/data2/pypi-packages/"
# Every valid download link must start with the mirror host.
mirror_url_base = "https://pypi.tuna.tsinghua.edu.cn"

# Read the listing (one "name,,,link" entry per line) and download each file.
requirements_txt = "./pypi_requirements_all.txt"
required_packages = util_folder_obj.read_txt_2_list(requirements_txt)
total_packages = len(required_packages)

# errors.txt records every entry that could not be downloaded. The context
# manager makes sure it is flushed and closed when the run ends or is aborted.
with open("./errors.txt", "w") as error_txt:
    for i, package_info in enumerate(required_packages):
        util_folder_obj.print_progress(i, total_packages)

        fields = package_info.split(",,,")
        # Malformed line or a link that does not point at the mirror.
        if len(fields) != 2 or not fields[1].startswith(mirror_url_base):
            error_txt.write(package_info + "\n")
            continue

        name, download_link = fields
        try:
            util_spider_obj.download_file(
                download_dir_base,  # directory the files are stored in
                name,  # file name of the package
                download_link,  # download link of the package
                save_dir_his=download_dir_base)  # also checked for an existing copy
        except Exception as exc:
            # One failed download must not abort a crawl of the whole index;
            # log the entry so it can be retried later.
            print(f"Failed to download {name}: {exc}")
            error_txt.write(package_info + "\n")
    
 

        如果是在linux中下载pypi包,基本上不会出什么问题,但是如果是在win10环境下执行以上程序的话,可能存在下载的文件的文件名不符合windows的格式的情况,因此需要对文件名做一些格式审查,这时可以用下面的程序下载pypi文件:

step2_download.py 在win10中使用

import os 
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import csv 
import shutil
import glob
import numpy as np

from util.util_folder import Class_Util_Folder
from util.util_spider import Class_Util_Spider

util_folder_obj = Class_Util_Folder()
util_spider_obj = Class_Util_Spider()

search_url_base = "https://pypi.tuna.tsinghua.edu.cn/simple"
download_url_base = "https://pypi.tuna.tsinghua.edu.cn"

download_dir_base = "./pypi-packages/"

# Characters that make a file name unusable for this Windows-hosted mirror.
# The double quote is forbidden in Windows file names as well; "[" and "]"
# are kept from the original list.
WINDOWS_FORBIDDEN_CHARS = ':*?"<>[]\\/|'

# Optional: build the listing first.
# requirements_txt = "./requirements_all.txt"
# if not os.path.exists(requirements_txt):
#     child_links = util_spider_obj.get_child_links(search_url_base)
#     util_spider_obj.get_grandson_links_names(child_links,"requirements_all.txt")

# Optional: record the names of the packages that are already downloaded.
# downloaded_txt = "./downloaded.txt"
# if not os.path.exists(downloaded_txt):
#     util_folder_obj.get_file_name_list(download_dir_base,downloaded_txt)

# Read the listing (one "name,,,link" entry per line) and download each file.
requirements_txt = "./pypi_requirements_all.txt"
required_packages = util_folder_obj.read_txt_2_list(requirements_txt)
total_packages = len(required_packages)

# errors.txt records every entry that was skipped or failed. The context
# manager makes sure it is flushed and closed when the run ends or is aborted.
with open("./errors.txt", "w") as error_txt:
    # The original kept an alternative loop header, presumably to resume an
    # interrupted run at a given index -- TODO confirm:
    # for i in range(838628, 2000000):
    for i, package_info in enumerate(required_packages):
        util_folder_obj.print_progress(i, total_packages)

        fields = package_info.split(",,,")
        if len(fields) != 2:
            error_txt.write(package_info + "\n")
            continue

        name, download_link = fields
        # Skip links outside the mirror and names Windows cannot store.
        # On Linux the name restriction does not apply.
        if (not download_link.startswith(download_url_base)
                or any(char in name for char in WINDOWS_FORBIDDEN_CHARS)):
            error_txt.write(package_info + "\n")
            continue

        try:
            util_spider_obj.download_file(
                download_dir_base, name, download_link,
                save_dir_his=download_dir_base)
        except Exception as exc:
            # One failed download must not abort a crawl of the whole index;
            # log the entry so it can be retried later.
            print(f"Failed to download {name}: {exc}")
            error_txt.write(package_info + "\n")
    
 

5. 步骤三:检查下载的pypi文件名是否符合pip2pi的要求(仅仅在win10系统中搭建镜像时使用,在linux系统中不用)。

       pip2pi在执行dir2pi时对python包的文件名有要求(在win10中),文件名不符合该命名格式要求的包会导致dir2pi执行出错并中断,而逐个排查这些错误十分费时费力。为了避免该情况,我对文件名格式进行了审查,并把不符合pip2pi格式要求的文件移到了单独的文件夹中。以下脚本参考了pip2pi的源码:

import os 
import re
import glob
import shutil
import pkg_resources

# 获取文件名列表

# 遍历文件名判断文件名格式是否符合格式要求

# 如果不符合格式要求,打印该文件名

class InvalidFilePackageName(ValueError):
    """Error for a file whose name is not in 'pkg-name-version.xxx' format.

    The message names the offending file and, when *basedir* is truthy, the
    directory it was found in.
    """

    def __init__(self, file, basedir=None):
        pieces = [
            "unexpected file name: %r " % (file, ),
            "(not in 'pkg-name-version.xxx' format",
        ]
        if basedir:
            pieces.append("; found in directory: %r" % (basedir))
        pieces.append(")")
        super().__init__("".join(pieces))

def check_package_name(file, basedir=None):
    """Tell whether a file name can be split into a package name and a rest.

    Only the base name of *file* is inspected. The rule depends on the
    extension:

    * ``.egg``: the project name is taken from
      ``pkg_resources.Distribution.from_location`` and something must follow
      it in the file name.
    * ``.whl``: the name is split from the right on ``-`` (at most four
      times) and something must follow the first part.
    * anything else: the name must contain a ``-`` directly followed by a
      digit (``<pkg>-<version...>``).

    Args:
        file: file name or path to check.
        basedir: unused; kept so existing calls keep working.

    Returns:
        True if the name has the expected shape, False otherwise.

        >>> check_package_name("foo-1.2.3_rc1.tar.gz")
        True
        >>> check_package_name("foo-bar-1.2-py27-none-any.whl")
        True
        >>> check_package_name("foo.whl")
        False
        >>> check_package_name("foo.png")
        False
    """
    file = os.path.basename(file)
    file_ext = os.path.splitext(file)[1].lower()
    if file_ext == ".egg":
        dist = pkg_resources.Distribution.from_location(None, file)
        name = dist.project_name
        # Skip the project name and the separator that follows it.
        rest = file[len(name) + 1:]
    elif file_ext == ".whl":
        bits = file.rsplit("-", 4)
        rest = "-".join(bits[1:])
    else:
        # The group names were missing from the pattern, which made it
        # uncompilable; they are restored here.
        match = re.search(r"(?P<pkg>.*?)-(?P<rest>\d+.*)", file)
        if not match:
            return False
        rest = match.group("rest")

    # A valid name always leaves a non-empty part after the package name.
    return bool(rest)

def file_name_match(file_name):
    """Search *file_name* for the ``<pkg>-<version...>`` pattern.

    Returns:
        The ``re.Match`` with the named groups ``pkg`` (text before the first
        ``-`` that is followed by a digit) and ``rest`` (everything from that
        digit on), or None if the name contains no such ``-``.
    """
    # The group names were missing from the pattern, which made it
    # uncompilable; they are restored here.
    match = re.search(r"(?P<pkg>.*?)-(?P<rest>\d+.*)", file_name)
    return match

def move_file_2_dir(file_path,dir_path):
    """Move *file_path* into *dir_path*, keeping its file name."""
    target_path = os.path.join(dir_path, os.path.basename(file_path))
    shutil.move(file_path, target_path)


if __name__ =="__main__":
    # #### Step 1: find the files whose names do not pass the check and move
    # them out of the package directory.
    # Raw strings: "\p" is not a valid escape sequence, and newer Python
    # versions warn about it in ordinary string literals.
    src_dir = r"E:\pypi-packages"
    error_dir = r"E:\pypi-packages-error"

    # shutil.move fails if the destination directory does not exist.
    os.makedirs(error_dir, exist_ok=True)

    file_name_list = glob.glob(src_dir+"/*.*")
    for f_path in file_name_list:
        # "*.*" also matches directories whose name contains a dot.
        if not os.path.isfile(f_path):
            continue

        correct = check_package_name(f_path)

        if not correct:
            move_file_2_dir(f_path,error_dir)
            print(f_path)

    ###### Step 2: try to bring a file name into the standard format
    # file_name ="aimmo-v-0.1.1-alpha.post.dev1.tar.gz"
    # match = file_name_match(file_name)
    # print(match)

    #### Step 3: string.replace experiment
    # file_name ="aimmo-v0.1.1-alpha.post.dev1.tar.gz"
    # file_name_new = file_name.replace("-v", "-v-")
    # print(file_name_new)

    #### Step 4: rename the packages whose version starts with "v"
    # NOTE(review): the two directory names below look swapped compared with
    # step 5, and each replace() starts again from f_name, so only the last
    # one takes effect -- confirm before re-enabling.
    # corrected_dir= "D:\pypi-build-master\pypi-packages-errors"
    # error_dir = "D:\pypi-build-master\pypi-packages-corrected"
    # file_name_list = glob.glob(error_dir+"/*.*")

    # for f_path in file_name_list:
    #     f_name = os.path.split(f_path)[-1]
    #     file_name_new = f_name.replace("-v", "-v-")
    #     file_name_new = f_name.replace("-V", "-v-")
    #     file_name_new = f_name.replace("_V", "-v-")
    #     file_name_new = f_name.replace("_v", "-v-")

    #     if "-v-" in file_name_new:
    #         file_save_path = os.path.join(corrected_dir,file_name_new)
    #         shutil.move(f_path,file_save_path)

    #### Step 5: check the names again
    # dir_path= "D:\pypi-build-master\pypi-packages-corrected"
    # error_dir = "D:\pypi-build-master\pypi-packages-errors"

    # file_paths = glob.glob(dir_path+"/*")
    # for f_path in file_paths:
    #     f_name = os.path.split(f_path)[-1]
    #     match = file_name_match(f_name)

    #     if not match:
    #         move_file_2_dir(f_path,error_dir)
    #         print(f_name)

    #### Step 6: put the corrected files back into the package directory
    # dir_path= "D:\pypi-build-master\pypi-packages-corrected"
    # build_dir = "D:\pypi_mirror_build\pypi-packages"

    # file_paths = glob.glob(dir_path+"/*")
    # for f_path in file_paths:
    #     f_name = os.path.split(f_path)[-1]
    #     match = file_name_match(f_name)

    #     if match:
    #         move_file_2_dir(f_path,build_dir)
    #         print(f_name)

    ## Step 7: build the index with dir2pi (left empty in the original)
    #


    
 

6. 在win10中安装pypi镜像时的其它注意事项

        如在从零搭建本地pypi镜像源1:快速体验_wzg2016的博客-CSDN博客中介绍的,在执行dir2pi之后,会生成很多index.html文件,也会生成一些其他的文件,但是搭建镜像只需要index.html文件就可以了,通过copy-paste可以达到仅仅保留index.html文件的目的,因为剩余的文件不符合win10的格式要求。

        另外,建议把simple文件夹重命名,比如重命名为"simple-wzg16",避免simple文件夹与已有文件夹重名,在执行nginx.exe时出现误判。

7. 获取requirements_all.txt更新

        代码被不小心删除了,这里只说一下当时的执行思路。

        首先:保存上一次获取的 requirements_all.txt

        然后,获取更新后的 requirements_all.txt

        最后,把两个txt文件读取为两个list后,转为set()格式,执行相减,得到最近更新的python包,把这些包保存即可。

8. 为什么选择了pip2pi方式,而不是pypi-server方式?

        原因:pip2pi方式可以生成本地文件,每次开机后用nginx可以快速启动。pypi-server的使用也十分方便,但是如果每次开机都要重新启动到可以执行使用的程度,需要的时间比较长,因此我使用pip2pi的方式。

9. 更新pypi包后,如何快速更新index.html列表?待定。

你可能感兴趣的:(镜像搭建,python)