Removing punctuation with string.punctuation (note: spaces are not removed):
import string
text = 'Hello World! ! !:$%^&'
text = text.translate(str.maketrans('', '', string.punctuation))
>> 'Hello World  '
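If the leftover spaces are a problem, a follow-up pass can collapse them (a minimal sketch using re, not part of the original snippet):
import re
text = re.sub(r'\s+', ' ', text).strip()
>> 'Hello World'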
The argument to stopwords.words() selects the list: "english" -> English stopwords, "chinese" -> Chinese stopwords, no argument -> stopwords for all supported languages.
text = "I am a SuperMan"
text = " ".join([word for word in text.split() if word not in stopwords.words("english")])
>> 'I SuperMan'
text = "我 是 中 过 人 你 好 啊 他 个"
" ".join([word for word in text.split() if word not in stopwords.words()])
>> '中 人 好'
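Note that NLTK ships the stopword lists as a separate corpus, so a one-time download is required before stopwords.words() can be called:
import nltk
nltk.download('stopwords')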
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
text = "been had done languages cities mice"
" ".join([lemmatizer.lemmatize(word) for word in text.split()])
>> 'been had done language city mouse'
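By default lemmatize() treats every word as a noun, which is why the verb forms 'been', 'had', and 'done' came through unchanged above. Passing a part-of-speech tag handles them (a small sketch; requires the wordnet corpus via nltk.download('wordnet'), and the exact output can vary with the WordNet version):
" ".join([lemmatizer.lemmatize(word, pos='v') for word in text.split()])
>> 'be have do languages cities mice'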
key: each word; value: its index (note: the index, NOT the word frequency!)
def list2dict(texts):
    wordmap = {}
    for text in texts:
        for word in text.split():
            # accumulate counts here, but only the keys' insertion order is used below
            wordmap[word] = wordmap.get(word, 0) + 1
    # assign indices starting from 1 (0 is typically reserved for padding)
    return {w: i + 1 for i, w in enumerate(wordmap.keys())}
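A quick sanity check on made-up input:
list2dict(["hello world", "hello there"])
>> {'hello': 1, 'world': 2, 'there': 3}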
def calculateMaxSeqLen(texts):
    # length, in whitespace-separated tokens, of the longest text
    max_len = float('-inf')
    for text in texts:
        if len(text.split()) > max_len:
            max_len = len(text.split())
    return max_len
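Likewise, on the same made-up input:
calculateMaxSeqLen(["hello world", "one two three"])
>> 3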
>>> from sklearn.model_selection import train_test_split
>>> f = ['1111', '22222', '3333', '444', '555', '66666', '77777', '8888', '999', '0000']
>>> g = [1, 2,3,4,5,6,7,8,9,0]
>>> train_test_split(f, g, test_size=0.2, random_state=1)
[['77777', '555', '1111', '444', '22222', '8888', '999', '66666'], ['3333', '0000'], [7, 5, 1, 4, 2, 8, 9, 6], [3, 0]]
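In practice the four return values are unpacked into named splits directly:
>>> X_train, X_test, y_train, y_test = train_test_split(f, g, test_size=0.2, random_state=1)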
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(["Hello World", "你 好 世 界"])
print(tokenizer.word_counts)
>>> OrderedDict([('hello', 1), ('world', 1), ('你', 1), ('好', 1), ('世', 1), ('界', 1)])
print(tokenizer.word_index)
>>> {'hello': 1, 'world': 2, '你': 3, '好': 4, '世': 5, '界': 6}
seq = tokenizer.texts_to_sequences(['你 好 是 界', '北 京 欢 迎 你'])
print(seq)
>>> [[3, 4, 6], [3]]
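Words never seen during fit_on_texts ('是', '北', '京', '欢', '迎') are silently dropped. To keep a placeholder for them instead, construct the Tokenizer with an oov_token (a sketch; the token name '<UNK>' is my choice, and Keras assigns it index 1):
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(["Hello World", "你 好 世 界"])
print(tokenizer.texts_to_sequences(['你 好 是 界']))
>>> [[4, 5, 1, 7]]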
# Zero-padding: pad each sequence to MAX_LEN, the required length
MAX_LEN = 5
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=MAX_LEN)
print(seq)
>>> array([[0, 0, 3, 4, 6],
[0, 0, 0, 0, 3]], dtype=int32)
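pad_sequences pads (and truncates) at the front by default; pass padding='post' to append the zeros instead:
seq = tf.keras.preprocessing.sequence.pad_sequences([[3, 4, 6], [3]], maxlen=MAX_LEN, padding='post')
print(seq)
>>> array([[3, 4, 6, 0, 0],
       [3, 0, 0, 0, 0]], dtype=int32)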
import numpy as np

def create_embedding_matrix(word_index, embedding_dict=None, d_model=100):
    """
    word_index: dict mapping each word to its integer index,
        e.g. {word1: 1, word2: 2, ...}
    embedding_dict: dict mapping each word to its embedding vector, e.g.
        {
            word1: [0, 1, 0, 0.97, 0.23],
            word2: [0.1, 0.21, 0, 0.97, 0.23],
            ...
        }
    d_model: dimensionality of each word vector; in the example above
        d_model = len([0, 1, 0, 0.97, 0.23]) = 5
    """
    # row 0 is left all-zero for the padding index; words missing from
    # embedding_dict also keep an all-zero row
    embedding_matrix = np.zeros((len(word_index) + 1, d_model))
    # loop over all the words
    for word, index in word_index.items():
        if word in embedding_dict:
            embedding_matrix[index] = embedding_dict[word]
    return embedding_matrix
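A typical way to fill embedding_dict is from a pretrained GloVe file; a sketch (the filename 'glove.6B.100d.txt' is an assumption, point it at wherever the vectors actually live):
embedding_dict = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:  # hypothetical path
    for line in f:
        values = line.split()
        # first token is the word, the rest are the vector components
        embedding_dict[values[0]] = np.asarray(values[1:], dtype='float32')

embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict, d_model=100)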