Deep Learning Enabled Semantic Communication Systems
The preprocessing script below does the following (a small end-to-end sketch follows this list):
1. Normalize each input string into a clean, well-formed text string (handling whitespace and special symbols);
2. Split the text into sentences and keep only those within a specified length range;
3. Add start and end tokens;
4. Filter punctuation;
5. Build the vocabulary, counting how many times each word occurs in the sentences.
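A rough end-to-end sketch of these five steps (illustrative only; it relies on normalize_string, cutted_data, tokenize, build_vocab and encode as defined further below, together with the SPECIAL_TOKENS table):

raw = ['<CHAPTER ID=1> I declare resumed the session of the European Parliament! </CHAPTER>']
cleaned = [normalize_string(line) for line in raw]    # step 1: clean each line
cleaned = cutted_data(cleaned)                        # step 2: keep 5-29 word sentences
vocab = build_vocab(cleaned, dict(SPECIAL_TOKENS),    # step 5: word -> index table
                    punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
tokens = tokenize(cleaned[0],                         # steps 3-4: add <START>/<END>, drop ? and .
                  punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
ids = encode(tokens, vocab)                           # the final integer sequence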
import unicodedata
import re
from w3lib.html import remove_tags
import pickle
import argparse
import os
import json
from tqdm import tqdm
from parameters import para_config
# parser = argparse.ArgumentParser()
# parser.add_argument('--input-data-dir', default='europarl/en', type=str)
# parser.add_argument('--output-train-dir', default='europarl/train_data.pkl', type=str)
# parser.add_argument('--output-test-dir', default='europarl/test_data.pkl', type=str)
# parser.add_argument('--output-vocab', default='europarl/vocab.json', type=str)
# Special tokens: padding, sentence start, sentence end, unknown word.
SPECIAL_TOKENS = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
    '<UNK>': 3,
}
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')
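A quick check of the accent stripping (illustrative values):

print(unicode_to_ascii('Pérez déjà vu'))  # -> 'Perez deja vu'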
# Normalize the input string into a clean, well-formed text string.
def normalize_string(s):
    # normalize unicode characters
    s = unicode_to_ascii(s)
    # remove the XML-tags
    s = remove_tags(s)
    # add a white space before !.? so the punctuation is separated from the preceding word
    s = re.sub(r'([!.?])', r' \1', s)
    # replace everything except letters and . ! ? with a space
    s = re.sub(r'[^a-zA-Z.!?]+', r' ', s)
    # collapse runs of whitespace into a single space
    s = re.sub(r'\s+', r' ', s)
    # change to lower case
    s = s.lower()
    return s
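For example (made-up input line; note that a leading and trailing space remain, which later become empty tokens):

line = '<SPEAKER ID=42> Madam President,   I voted  in favour! </SPEAKER>'
print(normalize_string(line))
# -> ' madam president i voted in favour ! '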
# Filter the cleaned sentences by length, keeping only those within the given
# range and discarding sentences that are too short or too long.
def cutted_data(cleaned, MIN_LENGTH=4, MAX_LENGTH=30):
    cutted_lines = list()
    for line in cleaned:
        # number of words in the current sentence
        length = len(line.split())
        # keep the sentence only if MIN_LENGTH < word count < MAX_LENGTH
        if length > MIN_LENGTH and length < MAX_LENGTH:
            line = [word for word in line.split()]
            cutted_lines.append(' '.join(line))
    return cutted_lines
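A small illustration (made-up sentences; only sentences with more than 4 and fewer than 30 words are kept):

cleaned = ['this is fine', 'the sitting of parliament is hereby resumed today']
print(cutted_data(cleaned))
# -> ['the sitting of parliament is hereby resumed today']
# the 3-word sentence is dropped, the 8-word one is kept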
def save_clean_sentences(sentence, save_path):
    pickle.dump(sentence, open(save_path, 'wb'))
    print('Saved: %s' % save_path)
def process(text_path):
    fop = open(text_path, 'r', encoding='utf8')
    raw_data = fop.read()
    # strip leading/trailing whitespace and split the file into sentences on '\n'
    sentences = raw_data.strip().split('\n')
    # normalize every sentence: remove XML tags, add spaces around punctuation, etc.
    raw_data_input = [normalize_string(data) for data in sentences]
    # keep only sentences whose length lies in the configured range
    raw_data_input = cutted_data(raw_data_input)
    fop.close()
    return raw_data_input
def tokenize(s, delim=' ', add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens by
    splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            # separate kept punctuation from the preceding word
            s = s.replace(p, '%s%s' % (delim, p))
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    tokens = s.split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens
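For instance (the '<START>'/'<END>' strings match the SPECIAL_TOKENS table above):

print(tokenize('i voted in favour !', punct_to_keep=[';', ','], punct_to_remove=['?', '.']))
# -> ['<START>', 'i', 'voted', 'in', 'favour', '!', '<END>']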
# Build the vocabulary (token -> index mapping).
def build_vocab(sequences, token_to_idx={}, min_token_count=1, delim=' ',
                punct_to_keep=None, punct_to_remove=None):
    # occurrence count of every token in the corpus
    token_to_count = {}
    for seq in sequences:
        # tokenize each sentence without adding start/end tokens
        seq_tokens = tokenize(seq, delim=delim, punct_to_keep=punct_to_keep,
                              punct_to_remove=punct_to_remove,
                              add_start_token=False, add_end_token=False)
        # count how often each token appears across all sentences
        for token in seq_tokens:
            if token not in token_to_count:
                token_to_count[token] = 0
            token_to_count[token] += 1
    # only tokens that occur at least min_token_count times are added to
    # token_to_idx and assigned a unique index
    for token, count in sorted(token_to_count.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    # e.g. {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, '': 4, 'a': 5,
    #       'abstentions': 6, 'accordance': 7, 'add': 8, 'adopted': 9, ...}
    return token_to_idx
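A tiny illustration (made-up sentences; the special tokens keep indices 0-3 and new words are appended in sorted order):

vocab = build_vocab(['we vote today', 'we vote tomorrow'],
                    dict(SPECIAL_TOKENS))  # copy, so the module-level dict is not modified
print(vocab)
# -> {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3,
#     'today': 4, 'tomorrow': 5, 'vote': 6, 'we': 7}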
# Map a list of string tokens to a list of integer indices using token_to_idx.
def encode(seq_tokens, token_to_idx, allow_unk=False):
    seq_idx = []
    for token in seq_tokens:
        if token not in token_to_idx:
            if allow_unk:
                # unknown words are mapped to the '<UNK>' token
                token = '<UNK>'
            else:
                raise KeyError('Token "%s" not in vocab' % token)
        seq_idx.append(token_to_idx[token])
    return seq_idx
# Map a list of integer indices back to tokens, optionally stopping at '<END>'.
def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
    tokens = []
    for idx in seq_idx:
        tokens.append(idx_to_token[idx])
        if stop_at_end and tokens[-1] == '<END>':
            break
    if delim is None:
        return tokens
    else:
        return delim.join(tokens)
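A round trip through encode and decode, reusing the toy vocab built above (illustrative):

idx_to_token = {idx: token for token, idx in vocab.items()}
ids = encode(['<START>', 'we', 'vote', 'today', '<END>'], vocab)
print(ids)                                   # -> [1, 7, 6, 4, 2]
print(decode(ids, idx_to_token, delim=' '))  # -> '<START> we vote today <END>'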
def main(args):
    data_dir = '/home/hx301/data/'
    # args.input_data_dir = args.input_data_dir
    # args.output_train_dir = args.output_train_dir
    # args.output_test_dir = args.output_test_dir
    # args.output_vocab = args.output_vocab
    print(args.input_data_dir)
    sentences = []
    print('Preprocess Raw Text')
    for fn in tqdm(os.listdir(args.input_data_dir)):
        if not fn.endswith('.txt'): continue  # skip files that do not end with .txt
        process_sentences = process(os.path.join(args.input_data_dir, fn))
        # append this file's sentences to the overall list
        sentences += process_sentences

    # remove duplicate sentences: count occurrences in a dict and keep only the keys
    a = {}
    for sentence in sentences:
        if sentence not in a:
            a[sentence] = 0
        a[sentence] += 1
    sentences = list(a.keys())
    print('Number of sentences: {}'.format(len(sentences)))

    print('Build Vocab')
    token_to_idx = build_vocab(
        sentences, SPECIAL_TOKENS,
        punct_to_keep=[';', ','], punct_to_remove=['?', '.']
    )
    vocab = {'token_to_idx': token_to_idx}
    print('Number of words in Vocab: {}'.format(len(token_to_idx)))

    # save the vocab
    if args.output_vocab != '':
        with open(args.output_vocab, 'w') as f:
            json.dump(vocab, f)

    print('Start encoding txt')
    results = []
    count_len = []
    # re-tokenize every sentence and, using the token_to_idx built above, map
    # each word to its index, turning the sentence into a list of integers
    for seq in tqdm(sentences):
        words = tokenize(seq, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        tokens = [token_to_idx[word] for word in words]
        count_len.append(len(tokens))
        results.append(tokens)

    print('Writing Data')
    # train set : test set = 9 : 1
    train_data = results[: round(len(results) * 0.9)]
    test_data = results[round(len(results) * 0.9):]
    with open(args.output_train_dir, 'wb') as f:
        pickle.dump(train_data, f)
    with open(args.output_test_dir, 'wb') as f:
        pickle.dump(test_data, f)


if __name__ == '__main__':
    # Set Parameters
    args = para_config()
    main(args)
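After running the script, the outputs can be read back as follows (a minimal sketch; the europarl/... paths are the defaults from the commented-out argparse section above):

import json
import pickle

with open('europarl/vocab.json', 'r') as f:
    token_to_idx = json.load(f)['token_to_idx']
idx_to_token = {idx: token for token, idx in token_to_idx.items()}

with open('europarl/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)

print('train sentences:', len(train_data))
# print the first training sentence as words again
print(' '.join(idx_to_token[i] for i in train_data[0]))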
{"token_to_idx": {"" : 0, "" : 1, "" : 2, "" : 3, "": 4, "a": 5, "abstentions": 6, "accordance": 7, "add": 8, "adopted": 9, "advertising": 10, "advisers": 11, "against": 12, "agenda": 13, "agriculture": 14, "all": 15, "allowances": 16, "already": 17, "always": 18, "amended": 19, "amendment": 20, "amendments": 21, "among": 22, "and": 23, "any": 24, "applause": 25, "appreciation": 26, "are": 27, "as": 28, "at": 29, "be": 30, "been": 31, "behalf": 32, "being": 33, "business": 34, "but": 35, "by": 36, "can": 37, "capital": 38, "card": 39, "cards": 40, "case": 41, "clearly": 42, "closed": 43, "closely": 44, "cohesion": 45, "coming": 46, "commandment": 47, "commend": 48, "commission": 49, "commissioner": 50, "committee": 51, "compliments": 52, "conclusions": 53, "continue": 54, "coordination": 55, "counted": 56, "creation": 57, "dangerous": 58, "de": 59, "debate": 60, "declared": 61, "depth": 62, "development": 63, "do": 64, "economic": 65, "electronically": 66, "elements": 67, "entitled": 68, "especially": 69, "european": 70, "event": 71, "facts": 72, "familiar": 73, "favour": 74, "few": 75, "finally": 76, "financing": 77, "first": 78, "for": 79, "forget": 80, "forgotten": 81, "from": 82, "fund": 83, "funds": 84, "give": 85, "gladly": 86, "goods": 87, "group": 88, "has": 89, "have": 90, "hear": 91, "her": 92, "his": 93, "house": 94, "i": 95, "if": 96, "in": 97, "include": 98, "included": 99, "indeed": 100, "into": 101, "is": 102, "it": 103, "its": 104, "job": 105, "just": 106, "keeping": 107, "koch": 108, "last": 109, "least": 110, "let": 111, "letter": 112, "like": 113, "link": 114, "logical": 115, "look": 116, "m": 117, "madam": 118, "made": 119, "main": 120, "make": 121, "mandate": 122, "member": 123, "members": 124, "mention": 125, "meticulous": 126, "minute": 127, "more": 128, "mr": 129, "mrs": 130, "much": 131, "my": 132, "necessary": 133, "no": 134, "not": 135, "noted": 136, "now": 137, "objectives": 138, "observed": 139, "of": 140, "on": 141, "oral": 142, "order": 143, "other": 144, "p": 145, "parliament": 146, "party": 147, "perfectly": 148, "period": 149, "piece": 150, "place": 151, "pleased": 152, "poettering": 153, "point": 154, "political": 155, "positions": 156, "ppe": 157, "presented": 158, "presently": 159, "presidency": 160, "president": 161, "principles": 162, "proceed": 163, "proposal": 164, "propose": 165, "pse": 166, "question": 167, "quite": 168, "rapporteur": 169, "reasonable": 170, "received": 171, "regarding": 172, "regions": 173, "reinstated": 174, "reiterate": 175, "rejected": 176, "remain": 177, "repeat": 178, "report": 179, "request": 180, "requests": 181, "result": 182, "road": 183, "room": 184, "rose": 185, "rural": 186, "s": 187, "safety": 188, "schroedter": 189, "segni": 190, "shall": 191, "she": 192, "should": 193, "silence": 194, "since": 195, "sitting": 196, "situation": 197, "so": 198, "social": 199, "socialists": 200, "speak": 201, "speakers": 202, "speaking": 203, "starting": 204, "statement": 205, "strategic": 206, "structural": 207, "substantive": 208, "suggestions": 209, "support": 210, "tabled": 211, "take": 212, "tax": 213, "thank": 214, "that": 215, "the": 216, "their": 217, "themselves": 218, "then": 219, "there": 220, "therefore": 221, "this": 222, "thursday": 223, "thus": 224, "time": 225, "to": 226, "tomorrow": 227, "too": 228, "topical": 229, "transport": 230, "two": 231, "unable": 232, "understood": 233, "union": 234, "upheld": 235, "urgent": 236, "very": 237, "vote": 238, "votes": 239, "voting": 240, "was": 241, "we": 242, "when": 243, 
"which": 244, "who": 245, "wholehearted": 246, "whose": 247, "will": 248, "willing": 249, "wishes": 250, "with": 251, "withdrawn": 252, "work": 253, "would": 254, "wurtz": 255, "yes": 256, "you": 257, "your": 258}}