Python
Notes
- Strings
- string[s_idx : e_idx]: a negative s_idx is first resolved to len(string) + s_idx; the slice is empty only when that resolved position is at or beyond e_idx (see the sketch below)
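A quick check of the slicing rule above; the string s is only an illustration:
s = 'hello'        # len(s) == 5
print(s[-2:3])     # -2 resolves to index 3; 3 >= 3, so the result is '' (empty)
print(s[-4:3])     # -4 resolves to index 1; 1 < 3, so the result is 'el' (not empty)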
Breakpoints
import pdb; pdb.set_trace()
Random
import random

random.seed(0)

def random_split_data(data_list, ratio):
    # Shuffle in place, then split off the first `ratio` fraction as the second sublist.
    val_nums = len(data_list)
    offset = int(val_nums * ratio)
    if val_nums == 0 or offset < 1:
        return [], data_list
    random.shuffle(data_list)
    sublist_1 = data_list[offset:]   # the remaining (1 - ratio) portion
    sublist_2 = data_list[:offset]   # the ratio portion
    return sublist_1, sublist_2

# url_set is defined elsewhere; 10% of it goes to url_valid, the rest to url_train
url_train, url_valid = random_split_data(list(url_set), ratio=0.1)
Time
import time

print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
Breaking out of nested loops
- for ... else syntax: the else block runs only when the loop finishes without hitting a break, so a break must be placed inside the for loop; otherwise every iteration completes and the else runs as well, giving an unexpected result (a nested-loop version is sketched after the example below)
s = [11, 2, 3]
for i in s:
    if i < 5:
        print(1)
else:
    # no break inside the loop, so this for-else branch always runs and prints "s"
    print("s")
Saving JSON to a file / keeping Chinese text readable in JSON
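No snippet was attached to this heading; a minimal sketch (file name and data are made up): ensure_ascii=False writes Chinese characters as-is instead of \uXXXX escapes.
import json

data = {'name': '张三', 'city': '北京'}   # hypothetical example data

with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)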
Conda
Exception handling
while True:
    try:
        num1 = int(input('Enter a number: '))
        num2 = int(input('Enter a number: '))
        division = num1 / num2
    except (ZeroDivisionError, ValueError) as e:
        if isinstance(e, ZeroDivisionError):
            print('Division by zero')
        elif isinstance(e, ValueError):
            print('Invalid input: not a number')
    else:
        # runs only when the try block raised no exception
        print('The two numbers divide to {}'.format(division))
        break
Opening and writing files
import sys
import json
import math
import re

from tqdm import tqdm

prefix_path = '/mnt/data_hub/raw_data/'
open_file = prefix_path + sys.argv[1]                        # input JSON-lines file
save_file = prefix_path + 'processed_data/' + sys.argv[2]    # output JSON-lines file

# load every line of the input file as a JSON object
j_ls = []
with open(open_file, 'r') as f_in:
    for line in tqdm(f_in):
        j_ls.append(json.loads(line))
def split_text(text):
    # Split text into sentences, keeping each sentence's trailing punctuation.
    text = text.strip('。.!!??')   # drop leading/trailing sentence punctuation
    text = text.replace('...', '')
    text = text.replace(' ', '')
    pattern = r'(。|!|\!|\.|?|\?)'   # capture group so re.split keeps the delimiters
    text_splits = re.split(pattern, text)
    text_splits = [t for t in text_splits if t]
    text_splits_droplast = text_splits[:-1]
    sentences = text_splits_droplast[::2]        # even positions: sentence bodies
    punctuations = text_splits_droplast[1::2]    # odd positions: their delimiters
    if not text_splits or not sentences or not punctuations:
        return []
    ls = []
    for i, j in zip(sentences, punctuations):
        ls.append(i + j)
    if text_splits[-1] != '':
        ls.append(text_splits[-1] + '。')   # last fragment has no delimiter; append one
    return ls
def process_text(j):
    # Build one instruction/response pair: roughly the first 70% of sentences become
    # the instruction (prefixed with a Chinese continuation prompt), the rest the response.
    ls = []
    text_ls = split_text(j['text'][0])
    text_cnt = len(text_ls)
    # Prompt (in Chinese): "Given the context, continue the following text as
    # realistically and in as much detail as possible.\nText:"
    prompt = '根据上下文情景,尽可能真实地、详细地补全下面一段文本的后续部分。\n文本:'
    p = 0.3
    split_num = text_cnt - math.ceil(text_cnt * p)   # number of sentences kept in the instruction
    input_text = prompt + text_ls[0]
    for text in text_ls[1:split_num]:
        input_text += text
    output_text = text_ls[split_num]
    if split_num + 1 < text_cnt:
        for text in text_ls[split_num + 1:]:
            output_text += text
    ls.append(
        {
            'instruction': input_text,
            'response': output_text,
            'data_source': j['data_source'],
            'model_arch': j['model_arch'],
            'id': j['id']
        }
    )
    return ls
i = 0
ls = []
with open(save_file, 'w') as f:
    for json_data in tqdm(j_ls, desc="formatting.."):
        text_cnt = len(json_data['text'])
        ls.append(text_cnt)
        if text_cnt == 1:
            i += 1
        text = json_data['text'][0]
        # skip records that are pure punctuation or cannot be split into sentences
        if not text.strip('。.!!??') or split_text(text) == []:
            continue
        for sample in process_text(json_data):
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

print('num of text_cnt == 1 is', i)
print('min/max of text_cnt:', min(ls), max(ls))
Pandas
Implementing SQL ROW_NUMBER (partitioned, ascending ordering)
- Implementing SQL's row_number, dense_rank and rank in pandas, as sketched below
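A minimal sketch with made-up column names (group, score); groupby(...).cumcount() and groupby(...)[col].rank(...) are the standard pandas calls:
import pandas as pd

df = pd.DataFrame({
    'group': ['a', 'a', 'a', 'b', 'b'],
    'score': [10, 20, 20, 5, 7],
})

# ROW_NUMBER() OVER (PARTITION BY group ORDER BY score DESC)
df = df.sort_values(['group', 'score'], ascending=[True, False])
df['row_number'] = df.groupby('group').cumcount() + 1

# RANK() / DENSE_RANK() OVER (PARTITION BY group ORDER BY score DESC)
df['rank'] = df.groupby('group')['score'].rank(method='min', ascending=False).astype(int)
df['dense_rank'] = df.groupby('group')['score'].rank(method='dense', ascending=False).astype(int)

print(df)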
Jupyter Notebook
List the running notebook servers
jupyter notebook list
Shows every running Jupyter notebook server along with its URL and token.