在上一篇博客《从零搭建本地pypi镜像源1:快速体验》(wzg2016的博客,CSDN)中,我们快速体验了pypi镜像源的搭建过程。本篇博客在上一篇的基础之上,着重介绍如何下载整个清华pypi镜像源。
我写了几个小函数,从清华pypi镜像源中下载所有的pypi 包。
util_spider.py
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os
class Class_Util_Spider():
    """Helpers for crawling an HTML package index and downloading its files.

    The methods fetch pages with ``requests``, pull ``<a href=...>`` anchors
    out of the HTML with a regular expression, and store the results as
    ``name,,,link`` lines in a text file.
    """

    # Prefix prepended to the links found on a package page.
    _DOWNLOAD_HOST = "https://pypi.tuna.tsinghua.edu.cn/"

    def __init__(self):
        pass

    @staticmethod
    def _extract_anchors(html_text):
        """Return a list of ``(href, text)`` pairs, one per anchor in *html_text*.

        NOTE(review): the patterns in the published listing were lost; this
        one assumes anchors of the form ``<a href="...">text</a>`` (extra
        attributes allowed) -- confirm against the live index pages.
        """
        return re.findall(r'<a\s[^>]*?href="([^"]*)"[^>]*>(.*?)</a>', html_text)

    @classmethod
    def _to_download_url(cls, href):
        """Turn a package-page href into a full download URL.

        Leading ``../`` segments are dropped before the host prefix is added.
        NOTE(review): assumes package pages use relative links such as
        ``../../packages/...`` -- confirm against the mirror's HTML.
        """
        while href.startswith("../"):
            href = href[3:]
        return cls._DOWNLOAD_HOST + href

    def get_child_links(self, base_url):
        """Collect every link found on the page at *base_url*.

        Args:
            base_url: address of the index page.

        Returns:
            A list with one URL per anchor, built as ``base_url + "/" + href``.
        """
        html_text = requests.request("GET", base_url).text
        return [base_url + "/" + href
                for href, _text in self._extract_anchors(html_text)]

    def get_grandson_links_names(self, url_path_list, text_path):
        """Write the file name and download link of every package file to a text file.

        Args:
            url_path_list: list of page URLs, one per package folder.
            text_path: path of the text file to (over)write.

        Output:
            One line per file in the form ``name,,,link`` where *name* is the
            anchor text and *link* is the full download URL.
        """
        total = len(url_path_list)
        # The context manager closes the file even if a request fails midway.
        with open(text_path, "w") as out_file:
            for i, url_path in enumerate(url_path_list):
                print(f"{i}/{total}", url_path)
                html_text = requests.request("GET", url=url_path).text
                for href, name in self._extract_anchors(html_text):
                    out_file.write(name + ",,," + self._to_download_url(href) + "\n")

    def read_txt_2_list(self, txt_path):
        """Read *txt_path* and return its lines (newline stripped) as a list."""
        assert os.path.exists(txt_path)
        with open(txt_path, "r") as txt_file:
            return [line.strip("\n") for line in txt_file]

    def download_file(self, save_dir, save_name, download_address, save_dir_his):
        """Download *download_address* to ``save_dir/save_name`` unless it already exists.

        Args:
            save_dir: existing directory that receives the file.
            save_name: file name to store the download under.
            download_address: URL to fetch.
            save_dir_his: second directory checked for an existing copy.

        The file is skipped when it is present in either directory. A response
        with an error status is reported and not written, so an error page is
        never stored as if it were the package.
        """
        assert os.path.exists(save_dir)
        save_path = os.path.join(save_dir, save_name)
        save_path_old = os.path.join(save_dir_his, save_name)
        if os.path.exists(save_path) or os.path.exists(save_path_old):
            print(f"{save_name} Existed!")
            return
        response = requests.get(download_address)
        if not response.ok:
            print(f"Failed to download {save_name}: HTTP {response.status_code}")
            return
        with open(save_path, "wb") as f:
            f.write(response.content)
        print(f"Success downloaded {save_name}")
util_folder.py
import os
import shutil
import glob
import numpy as np
class Class_Util_Folder():
    """File-system helpers: listing, moving and text-file round-tripping."""

    def __init__(self):
        pass

    def get_all_folders(self, base_dir):
        """Return the paths of all folders found recursively under *base_dir*."""
        return [os.path.join(root, dir_name)
                for root, dir_names, _files in os.walk(base_dir)
                for dir_name in dir_names]

    def get_all_files(self, base_dir):
        """Return the paths of all files found recursively under *base_dir*."""
        return [os.path.join(root, file_name)
                for root, _dirs, file_names in os.walk(base_dir)
                for file_name in file_names]

    def remove_empty_folder(self, folder_path):
        """Delete *folder_path* if it contains no entries."""
        if not os.listdir(folder_path):
            os.rmdir(folder_path)

    def get_file_lists(self, fold_path):
        """Return ``(paths, names)`` of the entries directly inside *fold_path*.

        Both lists are plain ``str`` lists, sorted by path.
        """
        # sorted() orders plain strings directly; no array round-trip needed.
        file_path_list = sorted(glob.glob(fold_path + "/*"))
        file_name_list = [os.path.split(path)[-1] for path in file_path_list]
        return file_path_list, file_name_list

    def get_file_name_list(self, folder_path, txt_path):
        """Write the names of all entries in *folder_path* to *txt_path*."""
        _paths, file_name_list = self.get_file_lists(folder_path)
        self.write_list_2_txt(file_name_list, txt_path)

    def write_list_2_txt(self, data_list, txt_path):
        """Write each string of *data_list* to *txt_path*, one per line.

        *data_list* must be non-empty and contain only ``str`` items.
        """
        assert len(data_list) > 0
        # The context manager closes the file even when an item is rejected.
        with open(txt_path, "w") as txt_file:
            for data in data_list:
                assert isinstance(data, str)
                txt_file.write(data + "\n")

    def read_txt_2_list(self, txt_path):
        """Read *txt_path* and return its lines (newline stripped) as a list."""
        assert os.path.exists(txt_path)
        with open(txt_path, "r") as txt_file:
            return [line.strip("\n") for line in txt_file]

    def move_file(self, src_file, aim_file):
        """Move *src_file* to *aim_file*; both must share the same file name.

        When *aim_file* already exists the source is deleted instead.
        """
        assert os.path.exists(src_file)
        # basename honours the platform's separators, so Windows paths that
        # use backslashes are compared by file name as well.
        assert os.path.basename(src_file) == os.path.basename(aim_file)
        if not os.path.exists(aim_file):
            shutil.move(src_file, aim_file)
        else:
            os.remove(src_file)

    def get_files_from_grandpa_fold(self, base_dir):
        """Return ``(paths, names)`` of the files located two levels below *base_dir*."""
        file_path_list, file_name_list = [], []
        for path in glob.glob(base_dir + "/*/*"):
            if os.path.isfile(path):
                file_path_list.append(path)
                file_name_list.append(os.path.split(path)[-1])
        return file_path_list, file_name_list

    def print_progress(self, current_num, total_num):
        """Print the progress as a percentage followed by ``current//total``."""
        # A zero total is reported as 0% instead of raising ZeroDivisionError.
        percent = round(current_num / total_num * 100, 2) if total_num else 0.0
        print(f"{percent}%,{current_num}//{total_num}")
step1_get_requirements.py
import os
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from util.util_spider import Class_Util_Spider
# Build the list of package files offered by the mirror's "simple" index.
# The crawl is skipped when the list file already exists.
util_spider_obj = Class_Util_Spider()
search_url_base = "https://pypi.tuna.tsinghua.edu.cn/simple"
requirements_txt = "./pypi_requirements_all.txt"

if not os.path.exists(requirements_txt):
    # First level: the links listed on the index page itself.
    child_links = util_spider_obj.get_child_links(search_url_base)
    # Second level: visit each of those pages and store the package file
    # names together with their download links in requirements_txt.
    util_spider_obj.get_grandson_links_names(child_links, requirements_txt)
step2_download.py 在linux中使用
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from util.util_folder import Class_Util_Folder
from util.util_spider import Class_Util_Spider
# Download every package listed in the requirements file (Linux variant).
util_folder_obj = Class_Util_Folder()
util_spider_obj = Class_Util_Spider()
download_dir_base = "/home/wzg/data2/pypi-packages/"

requirements_txt = "./pypi_requirements_all.txt"
required_packages = util_folder_obj.read_txt_2_list(requirements_txt)
total_packages = len(required_packages)

# errors.txt collects every entry that could not be scheduled for download;
# the context manager makes sure it is flushed and closed when the run ends.
with open("./errors.txt", "w") as error_txt:
    for i, package_info in enumerate(required_packages):
        util_folder_obj.print_progress(i, total_packages)
        fields = package_info.split(",,,")
        # A valid entry is exactly "name,,,download_link".
        if len(fields) != 2:
            error_txt.write(package_info + "\n")
            continue
        name, download_link = fields
        # Only links that point at the mirror itself are downloaded.
        if download_link.startswith("https://pypi.tuna.tsinghua.edu.cn"):
            util_spider_obj.download_file(
                download_dir_base,  # directory that receives the packages
                name,  # file name of the package
                download_link,  # download link of the package
                save_dir_his=download_dir_base)  # also checked for an existing copy
        else:
            # Record rejected links too, instead of dropping them silently.
            error_txt.write(package_info + "\n")
如果是在linux中下载pypi包,基本上不会出什么问题,但是如果是在win10环境下执行以上程序的话,可能存在下载的文件的文件名不符合windows的格式的情况,因此需要对文件名做一些格式审查,这时可以用下面的程序下载pypi文件:
step2_download.py 在win10中使用
import os
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import csv
import shutil
import glob
import numpy as np
from util.util_folder import Class_Util_Folder
from util.util_spider import Class_Util_Spider
# Download every package listed in the requirements file (Windows 10 variant).
util_folder_obj = Class_Util_Folder()
util_spider_obj = Class_Util_Spider()
search_url_base = "https://pypi.tuna.tsinghua.edu.cn/simple"
download_url_base = "https://pypi.tuna.tsinghua.edu.cn"
download_dir_base = "./pypi-packages/"

# Packages whose file name contains one of these characters are not
# downloaded on Windows. The double quote is included as well because
# Windows also rejects it in file names.
forbidden_name_chars = frozenset(':*?<>[]\\/|"')

# Optional (disabled): rebuild the requirement list.
# requirements_txt = "./requirements_all.txt"
# if not os.path.exists(requirements_txt):
#     child_links = util_spider_obj.get_child_links(search_url_base)
#     util_spider_obj.get_grandson_links_names(child_links,"requirements_all.txt")
# Optional (disabled): record the packages that are already downloaded.
# downloaded_txt = "./downloaded.txt"
# if not os.path.exists(downloaded_txt):
#     util_folder_obj.get_file_name_list(download_dir_base,downloaded_txt)

requirements_txt = "./pypi_requirements_all.txt"
required_packages = util_folder_obj.read_txt_2_list(requirements_txt)
total_packages = len(required_packages)

# To resume an interrupted run, iterate over a sub-range of the list instead,
# e.g. the original note used `for i in range(838628, 2000000)`.
# errors.txt collects every entry that is skipped; the context manager makes
# sure it is flushed and closed when the run ends.
with open("./errors.txt", "w+") as error_txt:
    for i, package_info in enumerate(required_packages):
        util_folder_obj.print_progress(i, total_packages)
        fields = package_info.split(",,,")
        # A valid entry is exactly "name,,,download_link".
        if len(fields) != 2:
            error_txt.write(package_info + "\n")
            continue
        name, download_link = fields
        name_is_safe = forbidden_name_chars.isdisjoint(name)
        if download_link.startswith(download_url_base) and name_is_safe:
            util_spider_obj.download_file(
                download_dir_base, name, download_link,
                save_dir_his=download_dir_base)
        else:
            error_txt.write(package_info + "\n")
pip2pi在执行dir2pi时对python包的文件名有要求(在win10中),不符合该命名格式要求的将会在执行dir2pi时出现错误与中断,这是一个十分费时费力的过程。为了避免该情况,我对文件名格式进行了审查,将不符合pip2pi格式要求的进行了移除处理。 以下脚本参考了pip2pi的源码,
import os
import re
import glob
import shutil
import pkg_resources
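# Get the list of file names.
# Go through the file names and check whether each one has the required format.
# If a file name does not have the required format, print it.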
class InvalidFilePackageName(ValueError):
    """Error describing a file name that is not in ``pkg-name-version.xxx`` format.

    Args:
        file: the offending file name.
        basedir: optional directory the file was found in; mentioned in the
            message when given.
    """

    def __init__(self, file, basedir=None):
        msg = f"unexpected file name: {file!r} (not in 'pkg-name-version.xxx' format"
        if basedir:
            # {!r} formats any object, including a tuple, which the
            # "%r" % (basedir) form could not handle.
            msg += f"; found in directory: {basedir!r}"
        msg += ")"
        super().__init__(msg)
def check_package_name(file, basedir=None):
    """Return True when *file* is named like ``pkg-name-version.xxx``.

    Only the base name of *file* is inspected. The name is split into a
    package part and a remainder; the name is accepted when the remainder
    is non-empty.

    Args:
        file: file name or path of the package file.
        basedir: accepted for signature compatibility; not used.

    Returns:
        True if the name can be split as described, False otherwise.

    Examples::

        >>> check_package_name("foo-1.2.3_rc1.tar.gz")
        True
        >>> check_package_name("foo-bar-1.2.tgz")
        True
        >>> check_package_name("kafka-quixey-0.8.1-1.tar.gz")
        True
        >>> check_package_name("foo-bar-1.2-py27-none-any.whl")
        True
        >>> check_package_name("Cython-0.17.2-cp26-none-linux_x86_64.whl")
        True
        >>> check_package_name("foo.whl")
        False
        >>> check_package_name("foo.png")
        False
    """
    file = os.path.basename(file)
    file_ext = os.path.splitext(file)[1].lower()
    if file_ext == ".egg":
        # Let pkg_resources work out the project name of the egg; the
        # remainder is whatever follows that name and one separator.
        dist = pkg_resources.Distribution.from_location(None, file)
        rest = file[len(dist.project_name) + 1:]
    elif file_ext == ".whl":
        # Split off at most four trailing dash-separated fields.
        bits = file.rsplit("-", 4)
        rest = "-".join(bits[1:])
    else:
        # The package name ends at the first dash that is followed by a digit.
        match = re.search(r"(?P<pkg>.*?)-(?P<rest>\d+.*)", file)
        if not match:
            return False
        rest = match.group("rest")
    return bool(rest)
def file_name_match(file_name):
    """Split *file_name* at the first dash that is followed by a digit.

    Returns:
        The ``re`` match object with the named groups ``pkg`` (text before
        the dash) and ``rest`` (text from the digit onwards), or None when
        the name contains no such dash.
    """
    return re.search(r"(?P<pkg>.*?)-(?P<rest>\d+.*)", file_name)
def move_file_2_dir(file_path, dir_path):
    """Move the file at *file_path* into the directory *dir_path*.

    The file keeps its name. *dir_path* is created first when it does not
    exist yet, so the move does not fail on a missing target directory.
    """
    os.makedirs(dir_path, exist_ok=True)
    move_path = os.path.join(dir_path, os.path.basename(file_path))
    shutil.move(file_path, move_path)
if __name__ == "__main__":
    # #### Step 1: find the files whose names do not have the required format
    # and move them out of the package directory.
    # Raw strings keep the backslashes of the Windows paths literal.
    src_dir = r"E:\pypi-packages"
    error_dir = r"E:\pypi-packages-error"
    file_name_list = glob.glob(src_dir + "/*.*")
    for f_path in file_name_list:
        correct = check_package_name(f_path)
        if not correct:
            move_file_2_dir(f_path, error_dir)
            print(f_path)

    # The remaining steps were run one at a time and are kept here, disabled,
    # as a record of the procedure.

    ###### Step 2: try to change a file name into the standard format
    # file_name ="aimmo-v-0.1.1-alpha.post.dev1.tar.gz"
    # match = file_name_match(file_name)
    # print(match)

    #### Step 3: string.replace experiment
    # file_name ="aimmo-v0.1.1-alpha.post.dev1.tar.gz"
    # file_name_new = file_name.replace("-v", "-v-")
    # print(file_name_new)

    #### Step 4: rename the packages whose version starts with "v"
    # NOTE(review): the two directory names below look swapped, and each
    # replace() starts again from f_name, so only the last one takes effect
    # -- confirm before re-enabling.
    # corrected_dir= "D:\pypi-build-master\pypi-packages-errors"
    # error_dir = "D:\pypi-build-master\pypi-packages-corrected"
    # file_name_list = glob.glob(error_dir+"/*.*")
    # for f_path in file_name_list:
    #     f_name = os.path.split(f_path)[-1]
    #     file_name_new = f_name.replace("-v", "-v-")
    #     file_name_new = f_name.replace("-V", "-v-")
    #     file_name_new = f_name.replace("_V", "-v-")
    #     file_name_new = f_name.replace("_v", "-v-")
    #     if "-v-" in file_name_new:
    #         file_save_path = os.path.join(corrected_dir,file_name_new)
    #         shutil.move(f_path,file_save_path)

    #### Step 5: check again whether the names have the required format
    # dir_path= "D:\pypi-build-master\pypi-packages-corrected"
    # error_dir = "D:\pypi-build-master\pypi-packages-errors"
    # file_paths = glob.glob(dir_path+"/*")
    # for f_path in file_paths:
    #     f_name = os.path.split(f_path)[-1]
    #     match = file_name_match(f_name)
    #     if not match:
    #         move_file_2_dir(f_path,error_dir)
    #         print(f_name)

    #### Step 6: put the corrected files back into the pypi package directory
    # dir_path= "D:\pypi-build-master\pypi-packages-corrected"
    # build_dir = "D:\pypi_mirror_build\pypi-packages"
    # file_paths = glob.glob(dir_path+"/*")
    # for f_path in file_paths:
    #     f_name = os.path.split(f_path)[-1]
    #     match = file_name_match(f_name)
    #     if match:
    #         move_file_2_dir(f_path,build_dir)
    #         print(f_name)

    ## Step 7: run dir2pi
    #
正如《从零搭建本地pypi镜像源1:快速体验》(wzg2016的博客,CSDN)中介绍的,执行dir2pi之后会生成很多index.html文件,同时也会生成一些其他文件。搭建镜像只需要index.html文件,而其余文件不符合win10的格式要求,因此可以通过copy-paste的方式仅保留index.html文件。
另外,建议把simple文件夹重命名,比如重命名为"simple-wzg16",避免simple文件夹与已有文件夹重名,在执行nginx.exe时出现误判。
代码被不小心删除了,这里只说一下当时的执行思路。
首先:保存上一次获取的 requirements_all.txt
然后,获取更新后的 requirements_all.txt
最后,把两个txt文件读取为两个list后,转为set()格式,执行相减,得到最近更新的python包,把这些包保存即可。
原因:pip2pi方式可以生成本地文件,每次开机后用nginx可以快速启动。pypi-server的使用也十分方便,但是如果每次开机都要重新启动到可以执行使用的程度,需要的时间比较长,因此我使用pip2pi的方式。