Python 使用小记

Python

小记

  1. 字符串
    • string[s_idx : e_idx]:负的 s_idx 会先换算为 len(string) + s_idx;只有换算后的起点不小于 e_idx 时才取出空子串(例如 'abcdef'[-2:3] == '',而 'abcdef'[-4:5] == 'cde')

断点

# Pause execution at this line and open the interactive pdb debugger.
import pdb; pdb.set_trace()

随机

  • 列表元素随机分为两部分
import random

# Seed the module-level RNG with a fixed value so random.* results repeat across runs.
random.seed(0)
 
def random_split_data(data_list, ratio):
    """Randomly split a sequence into a training part and a validation part.

    Args:
        data_list: Items to split. The input is copied first, so the
            caller's list is never reordered.
        ratio: Fraction of the items that goes to the validation part; the
            validation size is ``int(len(data_list) * ratio)``.

    Returns:
        A ``(train, valid)`` tuple of new lists. When the validation size
        rounds down to zero (including empty input), every item is returned
        in ``train`` and ``valid`` is empty.
    """
    # Work on a copy: random.shuffle reorders its argument in place.
    shuffled = list(data_list)
    valid_size = int(len(shuffled) * ratio)

    if valid_size < 1:
        # Nothing can be held out, so everything stays in the training part.
        # This keeps the (train, valid) order used by the normal return below.
        return shuffled, []

    random.shuffle(shuffled)
    return shuffled[valid_size:], shuffled[:valid_size]

# Usage example: hold out 10% of the URLs for validation.
# NOTE(review): url_set is assumed to be defined by the surrounding code.
url_train, url_valid = random_split_data(list(url_set), ratio=0.1)

时间

import time

# Print the current local time formatted as "YYYY-MM-DD HH:MM:SS".
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

跳出双层循环

  • for else语法:else 分支只在循环没有被 break 终止时执行;因此必须在for循环里面添加break才能跳过 else,否则循环正常执行完所有流程后 else 分支也会执行,导致意外的结果
# Demonstrates for/else without a `break`: the loop below never breaks,
# so the `else` branch runs once the loop has finished.
s = [11, 2, 3]

for i in s:
    if i < 5:
        # Reached for the elements 2 and 3.
        print(1)
else:
    # Executed because the loop ended without hitting a `break`.
    print("s") 

JSON保存文件、JSON保存中文

  • JSON保存中文

Conda

  • conda-pack:conda环境打包

异常处理

# Keep prompting until both inputs parse as integers and the division succeeds.
while True:
    try:
        # Code that may raise: int() raises ValueError on non-numeric input,
        # and the division raises ZeroDivisionError when num2 is 0.
        num1 = int(input('请输入一个数字:'))
        num2 = int(input('请输入一个数字:'))
        division = num1 / num2
    except (ZeroDivisionError, ValueError) as e:
        # Error handling: tell the two failure modes apart by exception type.
        if isinstance(e, ZeroDivisionError):
            print('程序出现了除以零错误')
        elif isinstance(e, ValueError):
            print('程序输入类型错误')
    else:
        # Runs only when the try body raised nothing.
        print('两个数字相除等于{}'.format(division))
        break  # inside a function, `return` could be used here instead

打开文件与写入文件

  • 一个示例
import sys
import json
import math
import re
from tqdm import tqdm

# Parsed input records, one per non-blank line of the input file.
j_ls = []

prefix_path = '/mnt/data_hub/raw_data/'
# argv[1] is the input file name and argv[2] the output file name; both are
# resolved relative to prefix_path.
open_file = prefix_path + sys.argv[1]
save_file = prefix_path + 'processed_data/' + sys.argv[2]

# Each line holds one JSON document. The context manager closes the file even
# if parsing fails, and the explicit encoding keeps non-ASCII text independent
# of the platform's default locale.
with open(open_file, 'r', encoding='utf-8') as src:
    for line in tqdm(src):
        if line.strip():  # tolerate blank lines, e.g. a trailing newline
            j_ls.append(json.loads(line))

# Characters treated as sentence terminators (ASCII and full-width forms).
_SENTENCE_END_CHARS = '。.!?！？'
# One or more consecutive terminators; the capturing group makes re.split
# keep the matched punctuation in its result.
_SENTENCE_SPLIT_RE = re.compile(r'([。.!?！？]+)')


def split_text(text):
    """Split a text into sentences, each ending with its punctuation.

    Leading/trailing terminators, the literal ``...`` and all spaces are
    removed before splitting. A run of consecutive terminators stays attached
    to the sentence it follows. The last sentence, whose own terminator was
    stripped, is closed with ``'。'``.

    Args:
        text: The text to split.

    Returns:
        A list of at least two sentences, or ``[]`` when the text contains
        fewer than two sentences (no terminator left after cleaning).
    """
    text = text.strip(_SENTENCE_END_CHARS)
    text = text.replace('...', '').replace(' ', '')
    # Removing '...' and spaces can expose terminators at the edges again.
    text = text.strip(_SENTENCE_END_CHARS)

    # After stripping, the pieces strictly alternate:
    # sentence, punctuation, sentence, ..., sentence.
    pieces = _SENTENCE_SPLIT_RE.split(text)
    if len(pieces) < 3:
        return []

    sentences = [
        body + mark for body, mark in zip(pieces[:-1:2], pieces[1::2])
    ]
    sentences.append(pieces[-1] + '。')
    return sentences
    
def process_text(j):
    """Turn one record into a text-continuation training sample.

    The first entry of ``j['text']`` is split into sentences with
    ``split_text``. The leading sentences, prefixed with a fixed prompt,
    become the instruction; the trailing ``ceil(30%)`` of the sentences
    become the response.

    Args:
        j: A dict providing the keys ``'text'`` (a sequence whose first item
            is the text), ``'data_source'``, ``'model_arch'`` and ``'id'``.

    Returns:
        A list with a single sample dict, or ``[]`` when the text yields fewer
        than two sentences and so cannot be divided into context and
        continuation.
    """
    text_ls = split_text(j['text'][0])
    text_cnt = len(text_ls)
    if text_cnt < 2:
        return []

    prompt = '根据上下文情景,尽可能真实地、详细地补全下面一段文本的后续部分。\n文本:'

    # Share of sentences (rounded up) reserved for the response.
    p = 0.3
    split_num = text_cnt - math.ceil(text_cnt * p)

    # With at least two sentences, 1 <= split_num < text_cnt, so both parts
    # are non-empty.
    input_text = prompt + ''.join(text_ls[:split_num])
    output_text = ''.join(text_ls[split_num:])

    return [
        {
            'instruction': input_text,
            'response': output_text,
            'data_source': j['data_source'],
            'model_arch': j['model_arch'],
            'id': j['id']
        }
    ]

i = 0    # number of records whose 'text' list has exactly one entry
ls = []  # length of the 'text' list of every record
# Explicit UTF-8: samples are dumped with ensure_ascii=False, so the output
# contains raw non-ASCII characters.
with open(save_file, 'w', encoding='utf-8') as f:
    for json_data in tqdm(j_ls, desc="formatting.."):
        text_cnt = len(json_data['text'])
        ls.append(text_cnt)
        if text_cnt == 1:
            i += 1
        if text_cnt == 0:
            # No text to process at all.
            continue
        text = json_data['text'][0]
        # Skip texts that are only punctuation or cannot be split.
        if not text.strip('。.!!??') or split_text(text) == []:
            continue
        # One JSON document per output line.
        for sample in process_text(json_data):
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

print('num of text_cnt == 1 is ', i)
if ls:
    # min()/max() raise ValueError on an empty list, so report only when
    # at least one record was read.
    print('min/max len of ls', min(ls), max(ls))

Pandas

实现 SQL 中的 row_number 排序(分组排序、递增排序)

  • pandas 实现 sql 中的row_number,dense_rank,rank

Jupyter Notebook

查看正在run的每个notebook
  • jupyter notebook list:可以查看正在run的每个notebook,可以查询其token

你可能感兴趣的:(Python,python)