import csv
with open(path, "r", encoding="utf-8") as f:
    reader = csv.reader(f)  # CSV reader; the default delimiter is ",", override it with delimiter
    # reader = csv.reader(f, delimiter="\t")
    birth_header = next(reader)  # read the header row
    for row in reader:  # iterate over the data rows
        ...
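As an alternative sketch, csv.DictReader consumes the header row itself and yields each row as a dict (the "label"/"text" column names below are assumptions):
import csv
with open(path, "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)  # uses the first row as the keys
    for row in reader:
        print(row["label"], row["text"])  # hypothetical column names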
import csv
f = open("data.csv", "w", encoding="utf-8", newline="")
f_writer = csv.writer(f, delimiter=" ")  # delimiter sets the field separator
f_writer.writerow(["label", "text"])  # write the header row as a list
for line in lines:
    f_writer.writerow(line)  # write one data row
    ...
f.close()
import jsonlines
with open("data.jsonl", "w", encoding="utf-8") as f:
    pass  # create/truncate the file first
with jsonlines.open("data.jsonl", mode="a") as f:
    f.write(input_line)  # input_line is a dict
    ...
or
import json
with open("data.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(input_line) + "\n")  # input_line is a dict
    ...
import jsonlines
with open("data.jsonl", "r") as f:
    lines = jsonlines.Reader(f)
    for line in lines:  # each line is a dict
        ...
or
import json
with open("data.jsonl", "r") as f:
    lines = f.readlines()
for line in lines:
    line = json.loads(line)  # parse the line into a dict
    ...
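If pandas is available, a one-line sketch can also load a JSON Lines file straight into a DataFrame:
import pandas as pd
df = pd.read_json("data.jsonl", lines=True)  # lines=True: one JSON object per line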
import ast
with open("data.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
for line in lines:  # line: '["id","text","label"]' (a string holding a list literal)
    line = ast.literal_eval(line)  # line: ["id", "text", "label"]
    img = line[0]
    ...
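A quick self-contained check of ast.literal_eval on a made-up line of that form:
import ast
line = '["001", "some text", "positive"]\n'
fields = ast.literal_eval(line.strip())
print(fields[0])  # -> 001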
from emoji import demojize
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token

def normalizeTweet(tweet):
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])
    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("n't ", " n't ")
        .replace("n 't ", " n't ")
        .replace("ca n't", "can't")
        .replace("ai n't", "ain't")
    )
    normTweet = (
        normTweet.replace("'m ", " 'm ")
        .replace("'re ", " 're ")
        .replace("'s ", " 's ")
        .replace("'ll ", " 'll ")
        .replace("'d ", " 'd ")
        .replace("'ve ", " 've ")
    )
    normTweet = (
        normTweet.replace(" p . m .", " p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )
    return " ".join(normTweet.split())

# if __name__ == "__main__":
#     print(
#         normalizeTweet(
#             "SC has first two presumptive cases of coronavirus, DHEC confirms https://postandcourier.com/health/covid19/sc-has-first-two-presumptive-cases-of-coronavirus-dhec-confirms/article_bddfe4ae-5fd3-11ea-9ce4-5f495366cee6.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share… via @postandcourier"
#         )
#     )
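A short usage sketch with a made-up tweet: mentions become @USER, links become HTTPURL, and single-character emoji are expanded by demojize (the exact emoji name can vary with the emoji library version):
print(normalizeTweet("@bob check this out https://example.com 😊"))
# expected output along the lines of:
# @USER check this out HTTPURL :smiling_face_with_smiling_eyes: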
# a single prefix or suffix
if string.startswith("http"):
    ...
if string.endswith(".jpg"):
    ...
# multiple possible prefixes or suffixes
if word.startswith(("#", '"', "@", "“", "\\", "'")):  # wrap the candidates in a tuple
    ...
if word.endswith(("!", ".", ":", '"', "?", ",", "\\", "”", "'")):
    ...
import re
my_re = re.compile(r'[A-Za-z]', re.S)  # match ASCII letters (re.S is harmless here; it only affects ".")
lst = re.findall(my_re, word)  # returns a list of matches
import re
my_re = re.compile(u'[\u4e00-\u9fa5]')  # match common Chinese (CJK) characters
lst = re.findall(my_re, word)  # returns a list of matches
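Quick check of the CJK pattern on a mixed string:
import re
print(re.findall(u'[\u4e00-\u9fa5]', "abc中文123"))  # -> ['中', '文']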
import re
try:
    co = re.compile(u'[\U00010000-\U0010ffff]')  # wide (UCS-4) Python build
except re.error:
    co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')  # narrow (UCS-2) build: match surrogate pairs
token = co.sub("", token)  # token is a single token (word/emoji)
# after substitution, if token is an empty string, the original token was an emoji
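Usage sketch building on the comment above: sub() returns an empty string (never None) when the whole token matched, so test for emptiness:
if co.sub("", token) == "":
    ...  # the original token consisted only of emoji / astral characters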
import csv
import pandas as pd
glove_model = pd.read_table("glove.twitter.27B.100d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)  # load the GloVe file
vec = glove_model.loc[word].values  # look up the word vector
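A hedged lookup sketch guarding against out-of-vocabulary words (get_vector and the zero-vector fallback are assumptions, not part of GloVe):
import numpy as np

def get_vector(word, dim=100):
    if word in glove_model.index:  # word is in the GloVe vocabulary
        return glove_model.loc[word].values
    return np.zeros(dim)  # OOV fallback: all-zero vector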
import numpy as np

def fill_sentence(embeddings, embedding_dim):
    """
    Args:
        embeddings: [sentence_array, sentence_array, ...]
        embedding_dim: must match the dimension used in embeddings
    """
    fill_embeddings = []
    length = [len(embedding) for embedding in embeddings]  # one embedding per sentence
    max_len = max(length)
    for embedding in embeddings:
        if len(embedding) < max_len:
            fill_zero = np.zeros((max_len - len(embedding), embedding_dim))
            fill_embedding = np.append(embedding, fill_zero)
            fill_embedding = fill_embedding.reshape(-1, embedding_dim)  # -1: infer this axis from embedding_dim
            fill_embeddings.append(fill_embedding)
        else:
            fill_embeddings.append(embedding)
    return np.array(fill_embeddings)
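Usage sketch with toy data: two sentences of 2 and 3 tokens, embedding_dim=4:
import numpy as np
sents = [np.random.rand(2, 4), np.random.rand(3, 4)]
padded = fill_sentence(sents, embedding_dim=4)
print(padded.shape)  # -> (2, 3, 4): both sentences zero-padded to the max length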
import os
for root, dirs, files in os.walk(filename):  # filename is a directory path
    """
    root: path of the directory currently being visited
    dirs: list of subdirectory names under it
    files: list of file names under it
    """
    for file in files:  # iterate over every file name in the directory
        new_filepath = os.path.join(root, file)  # build the full path
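Sketch combining os.walk with endswith to collect, e.g., every .jpg path under a directory (the extension is an assumption):
import os
jpg_paths = []
for root, dirs, files in os.walk(filename):
    for file in files:
        if file.endswith(".jpg"):
            jpg_paths.append(os.path.join(root, file))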
import random
lines = []
with open("data.jsonl", "r", encoding="utf-8") as f:
    lines = f.readlines()
index = [i for i in range(len(lines))]  # build an index list
random.shuffle(index)  # shuffle the indices
with open("train.jsonl", "w", encoding="utf-8") as f:
    for i in range(0, 3251):  # 3251 is the custom sample count for this dataset
        f.write(lines[index[i]])  # write the samples in shuffled order
        ...
or
import numpy as np
np.random.seed(1337)
np.random.shuffle(trains)
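Sketch extending the shuffled-index idea: the lines not written to train.jsonl can go to a dev set (the 3251 cut-off is the dataset-specific count from above, and lines/index are assumed still in scope):
with open("dev.jsonl", "w", encoding="utf-8") as f:
    for i in range(3251, len(lines)):
        f.write(lines[index[i]])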
Filter unwanted tokens out of a string:
def word_filter(token):  # filter function: return True to keep the token, False to drop it
    # drop links
    if token.startswith("http"):
        return False
    else:
        return True
sentence = [word for word in filter(word_filter, sentence)]  # sentence is a list of tokens
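Quick check with a made-up token list:
sentence = ["great", "http://t.co/x", "news"]
print([w for w in filter(word_filter, sentence)])  # -> ['great', 'news']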
import os
book = "a/"
if not os.path.exists(book):  # check whether the directory exists
    os.mkdir(book)  # create a single directory
book = "a/b/"
if not os.path.exists(book):
    os.makedirs(book)  # create nested directories
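On Python 3.2+, exist_ok makes nested-directory creation idempotent, so the exists() check can be dropped:
import os
os.makedirs("a/b/", exist_ok=True)  # no error if the directories already exist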
import os
import shutil
path1 = "row_path"  # source directory
path2 = "path"  # destination directory
for img in img_list:  # img_list holds the file names to move
    path = os.path.join(path1, img)
    despath = os.path.join(path2, img)
    shutil.move(path, despath)  # move the file from path to despath
    # shutil.copy(path, despath)  # copy instead of move