from langchain.evaluation import load_evaluator
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv # 导入从 .env 文件加载环境变量的函数
load_dotenv() # 调用函数实际加载环境变量
# from langchain.globals import set_debug  # import the function that sets debug mode in langchain
# set_debug(True)  # enable langchain's debug mode
from langchain.evaluation import load_evaluator
from langchain.chat_models import ChatOpenAI
# Build an LLM-graded evaluator that compares a prediction with a reference
# answer. No custom criteria are passed here, so whatever defaults the
# "labeled_score_string" evaluator ships with are used.
evaluator = load_evaluator(
    "labeled_score_string",
    llm=ChatOpenAI(model="gpt-3.5-turbo"),
)

# Correct
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser's third drawer.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)  # the call was previously left unclosed, swallowing the print below
print("Correct: ", eval_result)
(.venv) ~/Workspace/LLM/langchain-llm-app/ [develop+*] python Evaluate/ ⏎
Correct: {'reasoning': "Explanation:\nThe assistant's response is helpful and relevant to the user's question. It provides a concise and accurate answer, directing the user to find their socks in the third drawer of the dresser. The response is correct and factual, as it accurately refers to the location of the socks. While the response does not demonstrate depth of thought, it effectively addresses the user's query.\n\nRating: [[8]]", 'score': 8}
# Custom scoring rubric: a single criterion name mapped to the free-text
# scoring guide. Presumably passed as the evaluator's `criteria` argument --
# confirm against the load_evaluator call that consumes it.
accuracy_criteria = {
    "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
}  # closing brace was missing, leaving the dict literal unterminated
# Rebuild the evaluator so that it grades against the custom accuracy rubric.
# NOTE(review): the original call was truncated right after the opening
# parenthesis. The arguments below follow the LangChain scoring-evaluator
# documentation and reuse the model from the first evaluator in this file --
# confirm they match the intended configuration.
evaluator = load_evaluator(
    "labeled_score_string",
    criteria=accuracy_criteria,
    llm=ChatOpenAI(model="gpt-3.5-turbo"),
)

# Correct
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser's third drawer.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)  # closing parenthesis was missing
# Print restored so the script emits the "Correct: ..." line recorded below.
print("Correct: ", eval_result)
(.venv) ~/Workspace/LLM/langchain-llm-app/ [develop+*] python Evaluate/
Correct: {'reasoning': 'Explanation: The assistant accurately states that the socks can be found in the third drawer of the dresser, which aligns perfectly with the reference. There are no errors or omissions in the response.\n\nRating: [[10]]', 'score': 10}
# Correct but lacking information
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)  # closing parenthesis was missing
# NOTE(review): print restored; the label (including the literal "/n") is
# copied from the recorded run output that follows -- confirm.
print("Correct but lacking information >>> /n", eval_result)
(.venv) ~/Workspace/LLM/langchain-llm-app/ [develop+*] python Evaluate/
Correct but lacking information >>> /n {'reasoning': "Explanation: The assistant's response is partially accurate as it correctly mentions that the socks can be found in the dresser. However, it does not provide specific information about the location of the socks in the dresser. \n\nRating: [[7]]", 'score': 7}
# Incorrect
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dog's bed.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)  # closing parenthesis was missing
# NOTE(review): print restored; the label (including the literal "/n") is
# copied from the recorded run output that follows -- confirm.
print("Incorrect >>> /n", eval_result)
(.venv) ~/Workspace/LLM/langchain-llm-app/ [develop+*] python Evaluate/
Incorrect >>> /n {'reasoning': "The AI assistant's response is completely unrelated to the reference. The reference states that the socks are in the third drawer in the dresser, while the assistant suggests they can be found in the dog's bed. \n\nRating: [[1]]", 'score': 1}
# Rebuild the evaluator with score normalization.
# NOTE(review): the original call was truncated right after the opening
# parenthesis. The recorded output below reports a rating of [[7]] with a
# score of 0.7, which is what dividing the raw score by 10 produces, so
# normalize_by=10 is restored here; the other arguments follow the LangChain
# scoring-evaluator documentation -- confirm the intended configuration.
evaluator = load_evaluator(
    "labeled_score_string",
    criteria=accuracy_criteria,
    llm=ChatOpenAI(model="gpt-3.5-turbo"),
    normalize_by=10,
)

# Correct but lacking information
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)  # closing parenthesis was missing
# NOTE(review): print restored; label copied from the recorded run output.
print("Correct but lacking information >>>>", eval_result)
(.venv) ~/Workspace/LLM/langchain-llm-app/ [develop+*] python Evaluate/
Correct but lacking information >>>> {'reasoning': "Explanation: \nThe AI assistant's response is partially relevant to the user's question. It mentions the location of the socks as being in the dresser, which aligns with the ground truth. However, it lacks the specific information that the socks are in the third drawer. Overall, the response provides some relevant information but has a minor error in omitting the specific drawer location. \n\nRating: [[7]]", 'score': 0.7}