pandas 读取文件时建议填写绝对路径,使用相对路径可能出错、读不到文件。
原先尝试的 parquet 库一直安装失败,所以改用了 pyarrow
pip install pyarrow
参考:https://arrow.apache.org/docs/python/
import glob
import pandas as pd
import pyarrow.parquet as pq
# Load every file in the data directory and stack them into one DataFrame `m`.
aaaaa = glob.glob(r'C:\Users\lo理\oss数据\*')
if not aaaaa:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError("no files matched the data directory pattern")

kkk = []
for path in aaaaa:
    # Every "C:" occurrence is removed from the path before reading.
    # NOTE(review): presumably a workaround for a drive-letter path problem —
    # confirm it is still needed.
    local_path = path.replace("C:", "")
    print(local_path)
    kkk.append(pq.read_table(local_path).to_pandas())

# One concat call handles any number of files; the previous hard-coded
# range(1, 12) only worked when exactly 12 files were present.
m = pd.concat(kkk)
# Read a single parquet file directly into a DataFrame using the pyarrow engine.
# NOTE(review): `p` is not defined in this snippet — presumably the path of a
# parquet file; define it before running.
df = pd.read_parquet(p, engine="pyarrow")
# NOTE(review): the file name below looks like a masked placeholder — replace
# it with the real parquet file path.
k12 = pd.read_parquet(r"part-***nappy.parquet")
pandas 读取parquet的引擎:
pd.read_parquet(p,engine="pyarrow")
pd.read_parquet(p,engine="fastparquet")
建议使用 pyarrow,因为用 fastparquet 时遇到过列表类型的内容无法读取、显示为 None 的情况
==pandas ImportError: Missing optional dependency 'pyarrow'==
缺少 pyarrow 相关依赖,用 conda install pyarrow 解决
相比csv保存格式,读取时间大大减少
参考:https://blog.csdn.net/qq_23981335/article/details/117994811
参考:https://blog.csdn.net/abcd1f2/article/details/53322934
多线程参考:http://www.bokeren.cc/post-190.html
import requests
# Stream the image to disk in 512-byte chunks. Both the HTTP response and the
# output file are managed by `with`, so they are closed even if the download
# fails part-way.
with requests.get("https://i0.hdslb.com/bfs/album/1eab364136f7dc024eac1d663bb843c43c996798.jpg", stream=True) as r:
    # Fail fast on an HTTP error instead of saving the error body as img2.jpg.
    r.raise_for_status()
    with open(r"D:\用户点击日志\img2.jpg", "wb") as f:
        for chunk in r.iter_content(chunk_size=512):
            if chunk:  # skip empty chunks
                f.write(chunk)
进度条:
fname:保存到本地的文件名
from tqdm import tqdm
import requests
def download_file(url: str, fname: str, chunk_size: int = 1024) -> None:
    """Download *url* to the local file *fname*, showing a tqdm progress bar.

    Args:
        url: Address of the file to download.
        fname: Path of the file to write; also used as the progress-bar label.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True) as resp:
        # Do not write an HTTP error body into the target file.
        resp.raise_for_status()
        # Falls back to 0 when the server sends no content-length header.
        total = int(resp.headers.get("content-length", 0))
        with open(fname, "wb") as file, tqdm(
            desc=fname,
            total=total,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in resp.iter_content(chunk_size=chunk_size):
                # file.write returns the number of bytes written.
                bar.update(file.write(data))
# Fetch the TinyStories archive into the current working directory.
data_url = "https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz"
download_file(url=data_url, fname="TinyStories_all_data.tar.gz")
shlex.split 按 shell 规则拆分命令字符串,并去掉成对的单/双引号;`(?P<name>...)` 的意思是把该分组命名为 name(下面代码中的组名是 filename),匹配规则是后面的 `.+`
import os
import re
import shlex
import subprocess
def wget_fetch(download_url, file_path):
    """Download one file with ``wget`` and return the path it was saved to.

    The file name is the part of *download_url* between ``/parquet/`` and the
    last ``?``.

    Args:
        download_url: URL of the form ``.../parquet/<file name>?...``.
        file_path: Directory in which the downloaded file is stored.

    Returns:
        The path of the saved file, or ``None`` when no file name can be
        extracted from the URL or wget exits with a non-zero status.
    """
    # The named group must be spelled (?P<filename>...); without the
    # <filename> part the pattern does not compile.
    match = re.search(r"/parquet/(?P<filename>.+)\?", download_url)
    if match is None:
        print(f"Download fail; no parquet file name in url:{download_url}")
        return None

    save_path = os.path.join(file_path, match.group("filename"))
    print(save_path)

    # Build the argument list directly so quotes, spaces or backslashes in the
    # path or URL cannot change how the command is tokenized.
    cmd_list2 = [
        "wget",
        "--tries=3",
        "--timeout=60",
        f"--output-document={save_path}",
        download_url,
    ]
    # subprocess.run waits for the command to complete and returns a
    # CompletedProcess instance.
    cp = subprocess.run(cmd_list2)
    if cp.returncode != 0:
        print(f'Download fail; url:{download_url}')
        return None
    # In practice only a single file is downloaded.
    print(f"Download success; file: {save_path}")
    return save_path