前面已经熟悉了相关博文,fastTEXT,纠错/MLM/NER/情感分析/摘要,本文将继续介绍基本业务。
6,给定文本,做阅读理解题,也即Extractive Question Answering:extracting an answer from a text given a question 。相关数据集为SQuAD。
from transformers import pipeline
question_answerer = pipeline("question-answering")
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""
>>> result = question_answerer(question="What is extractive question answering?", context=context)###277356808 QQ group
>>> result
{'score': 0.6177276968955994, 'start': 34, 'end': 95, 'answer': 'the task of extracting an answer from a text given a question'}
>>> question_answerer(question="What is a good example of a question answering dataset?", context=context )
{'score': 0.5152317881584167, 'start': 147, 'end': 160, 'answer': 'SQuAD dataset'}
多个问答也可做,也可换用更大数据集的预训练模型
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch #QQ group 277356808
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
text = r"""
Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
questions = [
"How many pretrained models are available in Transformers?",
"What does Transformers provide?",
" Transformers provides interoperability between which frameworks?",
]
for question in questions:
inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
outputs = model(**inputs)
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits
# Get the most likely beginning of answer with the argmax of the score
answer_start = torch.argmax(answer_start_scores)
# Get the most likely end of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1
answer = tokenizer.convert_tokens_to_string(
tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
)
print(f"Question: {question}")
print(f"Answer: {answer}")
7,Causal Language Modeling因果语言模型
from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
import torch #QQ group 277356808
from torch import nn
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
sequence = f"Hugging Face is based in DUMBO, New York City, and"
inputs = tokenizer(sequence, return_tensors="pt")
input_ids = inputs["input_ids"]
# get logits of last hidden state
next_token_logits = model(**inputs).logits[:, -1, :]
# filter
filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
# sample
probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
generated = torch.cat([input_ids, next_token], dim=-1)
resulting_string = tokenizer.decode(generated.tolist()[0])
print(resulting_string)
Hugging Face is based in DUMBO, New York City, and is
这个效果有点差哦(留个坑),这是啥子哟。调整top_k_top_p_filtering中的参数并没有啥用,num_samples是下面token的个数。这也是属于text Generation里面的了。
8,文本生成Text Generation
from transformers import pipeline
text_generator = pipeline("text-generation")
>>> print(text_generator("I love you, I will", max_length=10, do_sample=False))
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
[{'generated_text': 'I love you, I will never forget you.'}]
有prompt(提示)及padding(背景)时,文本生成更好做,
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. """
prompt = "Today the weather is really nice and I am planning on "
inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]
prompt_length = len(tokenizer.decode(inputs[0]))
outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
print(generated)
>>> generated
"Today the weather is really nice and I am planning on going to a lovely outdoor restaurant. I'm feeling very happy, that's why I'm going to you right now, because I need to get a job for a few more days before I can go out and get a little more paid. I'm just ready to go. If I'm not sure what I can do, it's all important to me."
9,翻译translation
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
article_zh ="我爱你到永远。"
article_en ="I love you forver!"
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(
**encoded_hi,
forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire dans la Syrie."
# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(
**encoded_ar,
forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "The Secretary-General of the United Nations says there is no military solution in Syria."
#QQ group 277356808
tokenizer.src_lang = "zh_CN"
encoded_zh = tokenizer(article_zh, return_tensors="pt")
generated_tokens = model.generate(
**encoded_zh,
forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
#['I love you forever.']
tokenizer.src_lang = "en_XX"
encoded_en = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(
**encoded_en,
forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
#['I love you forever.']
欢迎关注本专栏,持续更新!