




2,掩蔽词masked language model

from transformers import pipeline
# Build a fill-mask pipeline on top of the bert-base-uncased checkpoint.
unmasker = pipeline('fill-mask', model='bert-base-uncased')
# Ask the model to fill in the [MASK] position; the result (shown on the next
# line) is a list of dicts with 'score', 'token', 'token_str' and 'sequence'.
unmasker("what a [MASK] blog.")
[{'score': 0.32585206627845764, 'token': 2307, 'token_str': 'great', 'sequence': 'what a great blog.'}, {'score': 0.04082264378666878, 'token': 3376, 'token_str': 'beautiful', 'sequence': 'what a beautiful blog.'}, {'score': 0.040087465196847916, 'token': 8403, 'token_str': 'lovely', 'sequence': 'what a lovely blog.'}, {'score': 0.03804076090455055, 'token': 6919, 'token_str': 'wonderful', 'sequence': 'what a wonderful blog.'}, {'score': 0.028535649180412292, 'token': 2204, 'token_str': 'good', 'sequence': 'what a good blog.'}]

获得 sentence embedding(也可以使用 sentence-transformers 库;这里直接用 BERT 模型同样可以)。

from transformers import BertTokenizer, BertModel
# Load the tokenizer and the bare BERT encoder from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "what a great blog I have seen."
# return_tensors='pt' asks the tokenizer for PyTorch tensors.
encoded_input = tokenizer(text, return_tensors='pt')
# Forward pass through the encoder.
# NOTE(review): the size printed below is presumably
# output.last_hidden_state.shape -- confirm.
output = model(**encoded_input)
torch.Size([1, 12, 768])



from transformers import AutoTokenizer
# Load the lower-casing (uncased) BERT vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sentence = "what a great blog I have seen in China!"
# Split the sentence into word-piece tokens.
tokens = tokenizer.tokenize(sentence)
# Map the tokens to vocabulary ids and decode them back to text; the
# transcript below inspects both values, so they must be defined here.
ids = tokenizer.convert_tokens_to_ids(tokens)
strings = tokenizer.decode(ids)
>>> tokens
['what', 'a', 'great', 'blog', 'i', 'have', 'seen', 'in', 'china', '!']
>>> ids
[2054, 1037, 2307, 9927, 1045, 2031, 2464, 1999, 2859, 999]
>>> strings
'what a great blog i have seen in china!'


# Switch to the case-preserving (cased) vocabulary and redo every step on the
# same sentence; without recomputing, the values inspected below would still
# be the ones produced by the previous tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokens = tokenizer.tokenize(sentence)
ids = tokenizer.convert_tokens_to_ids(tokens)
strings = tokenizer.decode(ids)
>>> tokens
['what', 'a', 'great', 'blog', 'I', 'have', 'seen', 'in', 'China', '!']
>>> ids
[1184, 170, 1632, 10679, 146, 1138, 1562, 1107, 1975, 106]
>>> strings
'what a great blog I have seen in China!'

NER 实体识别就是实体分类(entity token classification)

types of entities: location (LOC), organizations (ORG), person (PER) and Miscellaneous (MISC).


O Outside of a named entity
B-MISC Beginning of a miscellaneous entity right after another miscellaneous entity
I-MISC Miscellaneous entity
B-PER Beginning of a person’s name right after another person’s name
I-PER Person’s name
B-ORG Beginning of an organization right after another organization
I-ORG organization
B-LOC Beginning of a location right after another location
I-LOC Location


from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the tokenizer and token-classification model from the same NER checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
# Wrap both in a ready-to-use NER pipeline.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Xiaoming and I live in Beijing, I love you forver !"
# The result (shown on the next line) is a list with one dict per tagged
# word piece: 'entity', 'score', 'index', 'word', 'start', 'end'.
ner_results = nlp(example)
[{'entity': 'B-PER', 'score': 0.9986733, 'index': 4, 'word': 'Xiao', 'start': 11, 'end': 15}, {'entity': 'I-PER', 'score': 0.9260187, 'index': 5, 'word': '##ming', 'start': 15, 'end': 19}, {'entity': 'B-LOC', 'score': 0.99963117, 'index': 10, 'word': 'Beijing', 'start': 34, 'end': 41}]


当然也有较为基础的model,直接来用 (如果没有实体词,则为[])

from transformers import pipeline
# No model is named here, so the pipeline uses the library's default
# checkpoint for the "ner" task.
ner_pipe = pipeline("ner")
# Multi-line sample text; only one entity is reported for it in the output below.
sequence = """I'm not the best but the great in recommondation system at Beijing, now I'm looking forward to your kind reply about the offer, if you have any question about job or my work, please contact me without hesitation, I will give you answer in time. 
Any other problem you can join in the QQ group 277356808 !"""
>>> ner_pipe(sequence)
[{'entity': 'I-LOC', 'score': 0.99978286, 'index': 17, 'word': 'Beijing', 'start': 59, 'end': 66}]

4,情感分析(sentiment analysis):本质上就是一个文本分类问题——对整句话分词、编码后,由模型输出 positive / negative(有的模型还有 neutral)标签及其置信度,而不是对句中的单个词逐一标记。


from transformers import pipeline
# No model is named here, so the pipeline uses the library's default
# checkpoint for the "sentiment-analysis" task.
classifier = pipeline("sentiment-analysis")
# The result (shown below) is a list with one {'label', 'score'} dict per input.
result = classifier("I love you forver !")
>>> result
[{'label': 'POSITIVE', 'score': 0.9998611211776733}]


from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
def preprocess(text):
    """Mask user mentions and links in ``text`` before tokenization.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is.

    Args:
        text: Raw input string.

    Returns:
        The masked tokens re-joined with single spaces.
    """
    masked = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        # Collect every token; without this the function returns "".
        masked.append(token)
    return " ".join(masked)
# Checkpoint shared by the tokenizer, the config and the classifier.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PyTorch (PT) version of the model.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
text = "what a great blog I have ever seen in China!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First element of the output, first row of the batch, as a NumPy array.
scores = output[0][0].detach().numpy()
# Turn the raw scores into probabilities.
scores = softmax(scores)
# argsort is ascending, so reverse it to list the most probable label first.
ranking = np.argsort(scores)[::-1]
for rank, label_id in enumerate(ranking, start=1):
    # Cast the NumPy integer to a plain int for the id2label lookup.
    label = config.id2label[int(label_id)]
    score = scores[label_id]
    print(f"{rank}) {label} {np.round(float(score), 4)}")

1) positive 0.9859
2) neutral 0.0116
3) negative 0.0025


from transformers import AutoModelWithLMHead, AutoTokenizer
# Load the tokenizer and model from the same news-summarization checkpoint.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-summarize-news")
# NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
# releases; AutoModelForSeq2SeqLM looks like the replacement for this
# T5 checkpoint -- confirm before upgrading.
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-summarize-news")
def summarize(text, max_length=150):
    """Summarize ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The document to summarize.
        max_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The first decoded sequence produced by ``model.generate``.
    """
    encoded = tokenizer.encode(
        text,
        return_tensors="pt",
        add_special_tokens=True,
    )
    sequences = model.generate(
        input_ids=encoded,
        num_beams=2,
        max_length=max_length,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    # Decode every generated sequence, then hand back the first one.
    summaries = []
    for sequence in sequences:
        decoded = tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        summaries.append(decoded)
    return summaries[0]
>>> summarize('After the sound and the fury, weeks of demonstrations and anguished calls for racial justice, the man whose death gave rise to an international movement, and whose last words — “I can’t breathe” — have been a rallying cry, will be laid to rest on Tuesday at a private funeral in Houston.George Floyd, who was 46, will then be buried in a grave next to his mother’s.The service, scheduled to begin at 11 a.m. at the Fountain of Praise church, comes after five days of public memorials in Minneapolis, North Carolina and Houston and two weeks after a Minneapolis police officer was caught on video pressing his knee into Mr. Floyd’s neck for nearly nine minutes before Mr. Floyd died. That officer, Derek Chauvin, has been charged with second-degree murder and second-degree manslaughter. His bail was set at $1.25 million in a court appearance on Monday. The outpouring of anger and outrage after Mr. Floyd’s death — and the speed at which protests spread from tense, chaotic demonstrations in the city where he died to an international movement from Rome to Rio de Janeiro — has reflected the depth of frustration borne of years of watching black people die at the hands of the police or vigilantes while calls for change went unmet.', 80)

"at a private funeral in Houston on Tuesday. Floyd, who was 46, died of multiple organ failure last month. A Minnesota police officer was caught on video pressing his knee into Mr. Floyd’s neck for nearly nine minutes before he died. A Minneapolis police officer has been charged with second-degree murder and manslaughter. Floyd's bail was set at $1"

