本文主要整理了工作中使用 Python 进行数据处理时遇到的一些知识点。如有不足,请指正~
def string_reversel(text='abc'):
    """Return ``text`` in reverse order.

    The reversal is done with an extended slice (step -1), so any
    sliceable sequence (str, list, tuple, ...) is accepted and the result
    has the same type as the input.

    Args:
        text: Sequence to reverse. Defaults to ``'abc'``.

    Returns:
        A reversed copy of ``text``.

    >>> string_reversel('abc')
    'cba'
    """
    return text[::-1]
# Convert the table to a nested dict of the form {column: {index: value}}.
# The keyword is ``orient``; the old ``outtype`` spelling is no longer
# accepted by pandas.
d = table.to_dict(orient='dict')
最后输出d嵌套字典
我们将表的索引替换成“表名”这一列,然后只保留“表描述”这一列,即 table[['表描述']]:
# Use the '表名' column as the row index, then keep only the '表描述'
# column (selected with a list, so the result is still two-dimensional).
table.index = table.loc[:, '表名']
dd = table.loc[:, ['表描述']]
然后将其转换为字典格式:
# Nested dict of the form {'表描述': {index label: value}}.
# ``orient`` replaces the ``outtype`` keyword, which pandas no longer accepts.
d = dd.to_dict(orient='dict')
# Inner mapping for the '表描述' column: index label -> value.
ddd = d['表描述']
# Create a new column by mapping every '表名称' value through that dict.
table_filed['d'] = table_filed['表名称'].map(ddd)
# Summary statistics of the int64 columns, transposed so that each column
# becomes one row.
int_summary = df.select_dtypes(include=['int64']).describe().T
# Fraction of missing entries per column: (total rows - non-null count) / total rows.
missing_fraction = df.apply(lambda col: (len(col) - col.count()) / len(col))
# Attach the fraction as an extra 'missing_pct' column of the summary.
int_summary.assign(missing_pct=missing_fraction)
from datetime import datetime
def time_date_func(time):
    """Convert a Chinese-formatted date string into a pandas datetime.

    '年' and '月' are replaced with '-', '日' is removed, and the resulting
    string is handed to ``pd.to_datetime``.

    Args:
        time: Date string such as ``'2020年1月2日'``.

    Returns:
        Whatever ``pd.to_datetime`` returns for the normalised string
        (a ``Timestamp`` for a single parsable date).
    """
    # Build the normalised string in one pass instead of rebinding the
    # parameter after every replacement.
    normalized = time.replace('年', '-').replace('月', '-').replace('日', '')
    return pd.to_datetime(normalized)
# Parse every entry of the 'date' column with time_date_func and write the
# results back; the bare name on the last line displays the frame in a notebook.
tmp_jg["date"] = list(map(time_date_func, tmp_jg['date'].values))
tmp_jg
def _sparse_pairs_to_dict(pairs):
    """Return an ``{index: value}`` dict built from ``(index, value)`` tuples."""
    mapping = {}
    for pair in pairs:
        assert isinstance(pair, tuple), "eg: (1, 0.45)"
        # A repeated index overwrites the earlier value.
        mapping[pair[0]] = pair[1]
    return mapping


def cosine_sparse_sim(a, b):
    """Return the cosine similarity of two sparse vectors.

    Each vector is a list of ``(index, value)`` tuples, for example
    ``[(1, 0.45), (2, 0.56)]``. Indices missing from a vector contribute
    nothing to the dot product.

    Args:
        a: First sparse vector (list of tuples).
        b: Second sparse vector (list of tuples).

    Returns:
        The dot product over the indices present in both vectors, divided
        by the product of the two Euclidean norms. If either vector has a
        zero norm (e.g. an empty list) the division is 0/0 and numpy
        yields ``nan``.

    Raises:
        AssertionError: If ``a`` or ``b`` is not a list, or an element is
            not a tuple.
    """
    assert isinstance(a, list) and isinstance(b, list), "eg: [(1, 0.45), (2, 0.56)]"
    a_dict = _sparse_pairs_to_dict(a)
    b_dict = _sparse_pairs_to_dict(b)

    # Only indices present in both vectors contribute to the dot product.
    shared = [key for key in a_dict if key in b_dict]
    a_shared = np.array([a_dict[key] for key in shared])
    b_shared = np.array([b_dict[key] for key in shared])

    # The norms use every stored value, not just the shared ones.
    a_values = np.array(list(a_dict.values()))
    b_values = np.array(list(b_dict.values()))

    dot = np.sum(a_shared * b_shared)
    norms = np.sqrt(np.sum(a_values ** 2)) * np.sqrt(np.sum(b_values ** 2))
    return dot / norms
def cos(vector1, vector2):
    """Return the cosine similarity of two dense numeric vectors.

    Elements are paired with ``zip``, so if the inputs differ in length the
    extra trailing elements of the longer one are ignored.

    Args:
        vector1: Iterable of numbers.
        vector2: Iterable of numbers.

    Returns:
        The dot product divided by the product of the two norms, or
        ``None`` when either accumulated norm is zero (which includes
        empty input).
    """
    dot_product = 0.0
    norm_a = 0.0
    norm_b = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        norm_a += a ** 2
        norm_b += b ** 2
    # The similarity is undefined for a zero vector; signal that with None
    # instead of dividing by zero.
    if norm_a == 0.0 or norm_b == 0.0:
        return None
    return dot_product / ((norm_a * norm_b) ** 0.5)
# Cast the column to str so that every entry supports .split().
df['hotel_info_bz'] = df['hotel_info_bz'].astype(str)


def strip_func(x):
    """Return the items of ``x`` with surrounding whitespace stripped."""
    return [v.strip() for v in x]


# Comma-separated, whitespace-trimmed tokens of every row.
a = [strip_func(v.split(',')) for v in df['hotel_info_bz']]

# NOTE(review): ``a`` is rebound here, so the token lists built above are
# discarded. ``a`` and ``b`` are the raw strings of the first and second row.
a = df['hotel_info_bz'][0:1].values[0]
b = df['hotel_info_bz'][1:2].values[0]
# Both operands are plain strings, so this iterates over the characters of
# the first row and prints each one that also occurs in the second row.
# NOTE(review): if a token-level comparison was intended, compare the
# split/stripped lists instead - confirm with the author.
for v in a:
    if v in b:
        print(v)
# os.path.exists(path): check whether the given path exists.
# NOTE: the name ``dir`` shadows the builtin of the same name; it is kept
# because later snippets read it.
dir = "../cache/dh_ow_data/"
if os.path.exists(dir):
    print("文件路径存在")
else:
    print("文件路径不存在")
# os.path.isfile(path): check whether the given path is a regular file.
dir = "../cache/dh_ow_data/000000_0"
if os.path.isfile(dir):
    print("文件存在")
else:
    print("文件不存在")
# os.path.getsize(path): size of the file at ``path`` in bytes.
size = os.path.getsize(dir)
# Divide by 1024 to display the size in kilobytes.
print(dir, '文件大小:', size / 1024)
##os.walk(path) 遍历path
os.walk(path) 遍历path,每次迭代返回一个三元组(dirpath, dirnames, filenames)。dirpath表示遍历到的路径;dirnames表示该路径下的子目录名,是一个列表;filenames表示该路径下的文件名,也是一个列表。例如:当遍历到c:\windows时,dirpath="c:\windows",dirnames是这个路径下所有子目录名的列表,filenames是这个路径下所有文件名的列表。(注意:Python 3 已移除 os.path.walk,应使用 os.walk。)
python实现文件夹遍历参考链接