From 3df3041caf2f315e64cf5e8393416090a7e488e5 Mon Sep 17 00:00:00 2001
From: undercrater
Date: Mon, 21 Jul 2025 21:09:52 +0800
Subject: [PATCH] feat: implement the Evaluator report evaluation node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/manager/nodes.py          | 28 ++++++++----
 src/manager/search_context.py |  1 +
 src/prompts/evaluator.md      | 61 ++++++++++++++++++++++++++
 src/report/evaluator.py       | 81 +++++++++++++++++++++++++++++++++++
 src/report/report.py          |  6 +++
 5 files changed, 169 insertions(+), 8 deletions(-)
 create mode 100644 src/prompts/evaluator.md
 create mode 100644 src/report/evaluator.py

diff --git a/src/manager/nodes.py b/src/manager/nodes.py
index bb23ddd..7e8c414 100644
--- a/src/manager/nodes.py
+++ b/src/manager/nodes.py
@@ -22,6 +22,7 @@ from src.prompts import apply_system_prompt
 from src.query_understanding.planner import Planner
 from src.query_understanding.router import classify_query
 from src.report import Reporter, ReportLang, ReportFormat, ReportStyle
+from src.report.evaluator import Evaluator
 from src.retrieval.collector import Collector
 
 logger = logging.getLogger(__name__)
@@ -91,23 +92,27 @@ def research_manager_node(context: SearchContext, config: RunnableConfig) -> Com
         logger.info(f"Has executed {plan_executed_num} plans, go to next plan reasoning")
         return Command(update={"plan_executed_num": plan_executed_num}, goto="plan_reasoning")
 
+    report_generated_num = context.get("report_generated_num", 0)
+    max_report_generated_num = config.get("configurable", {}).get("max_report_generated_num", 0)
+    if report_generated_num >= max_report_generated_num:
+        logger.info(f"reached max generation num: {max_report_generated_num}")
+        return Command(goto="__end__")
+
     # The report has been generated, and if the report_evaluation is empty, go to evaluator,
     report_evaluation = context.get("report_evaluation", "")
     if report_evaluation == "":
         return Command(goto="evaluator")
 
     # If the report_evaluation is "pass", terminate, otherwise, re-execute the plan
-    if report_evaluation == "pass":
+    evaluation_result, _, next_step = report_evaluation.partition(" ")
+    if evaluation_result == "pass":
         logger.info(f"report evaluation passed")
         return Command(goto="__end__")
 
     logger.info(f"report evaluation not pass")
-    report_generated_num = context.get("report_generated_num", 0)
-    max_report_generated_num = config.get("configurable", {}).get("max_report_generated_num", 0)
-    if report_generated_num >= max_report_generated_num:
-        logger.info(f"reached max generation num: {max_report_generated_num}")
-        return Command(goto="__end__")
-    return Command(goto="plan_reasoning")
+    if next_step not in ("plan_reasoning", "reporter"):
+        next_step = "plan_reasoning"
+    return Command(update={"report": "", "report_evaluation": ""}, goto=next_step)
 
 
 async def info_collector_node(context: SearchContext, config: RunnableConfig) -> Command:
@@ -200,7 +205,14 @@ def reporter_node(context: SearchContext, config: RunnableConfig) -> Command:
 
 def evaluator_node(context: SearchContext, config: RunnableConfig) -> Command:
     logger.info(f"start evaluator node: \n{context}")
+
+    evaluator = Evaluator()
+    result, info = evaluator.evaluate_report(context, config)
     return Command(
-        update={"report_evaluation": "pass"},
+        update={
+            "report_evaluation": result,
+            "evaluation_details": info,
+            "messages": [HumanMessage(content=info, name="evaluator")],
+        },
         goto="research_manager",
     )
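For reference, the evaluator and the research manager now share an implicit contract: `report_evaluation` carries a two-token string of the form `"<overall result> <next step>"`, such as `pass end`, `failed plan_reasoning`, or `failed reporter`. The snippet below is an illustrative sketch of that parsing contract; `parse_evaluation` is a hypothetical helper, not part of the patch, which performs the same parsing that `research_manager_node` does inline.

```python
def parse_evaluation(report_evaluation: str) -> tuple[str, str]:
    # Split "pass end" / "failed reporter" into the verdict and the suggested next step.
    result, _, next_step = report_evaluation.partition(" ")
    # Fall back to re-planning when the suggested next step is missing or unknown.
    if result != "pass" and next_step not in ("plan_reasoning", "reporter"):
        next_step = "plan_reasoning"
    return result, next_step


assert parse_evaluation("pass end") == ("pass", "end")
assert parse_evaluation("failed reporter") == ("failed", "reporter")
assert parse_evaluation("failed") == ("failed", "plan_reasoning")
```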
diff --git a/src/manager/search_context.py b/src/manager/search_context.py
index 88b017b..831adee 100644
--- a/src/manager/search_context.py
+++ b/src/manager/search_context.py
@@ -52,3 +52,4 @@ class SearchContext(MessagesState):
     report: str = ""
     report_generated_num: int = 0
     report_evaluation: str = ""
+    evaluation_details: str = ""
diff --git a/src/prompts/evaluator.md b/src/prompts/evaluator.md
new file mode 100644
index 0000000..2576d20
--- /dev/null
+++ b/src/prompts/evaluator.md
@@ -0,0 +1,61 @@
+---
+CURRENT TIME: {{CURRENT_TIME}}
+---
+
+# Report Evaluation Agent
+
+## Role
+- You are a strict, professional report reviewer who is proficient in the evaluation criteria for different types of reports.
+- Your main responsibility is to evaluate, based on the given report title, whether the quality of a report meets the standards across multiple dimensions.
+- Rather than affirming the value of the report, strive to identify its shortcomings and give it a ``failed`` verdict, even if you think the report already meets the standard.
+
+## Input
+- Title: the original title of the report that the user wants to generate
+- Report: the full report that you need to evaluate
+
+## Steps
+
+There are several dimensions that you must evaluate for every report.
+You are also encouraged to add further dimensions for a more comprehensive result.
+
+### Relevance
+
+- The content of the report should be strongly relevant to the original title.
+
+### Richness of content
+
+- The content of the report should be detailed, comprehensive, and in-depth.
+
+### Readability
+
+- The report should be well structured with clear sections and subsections. If it involves comparisons or listings, it is advisable to present them in tables.
+
+### Compliance
+
+- If the user has specific requirements for the report, they must be strictly adhered to.
+
+### Next Step
+
+- If the evaluation result is ``pass``, the value of ``Next Step`` should be ``end``.
+- If the evaluation result is ``failed``, you should determine which stage is responsible for the failure and should regenerate the report.
+- If the content of the report fails the evaluation (for example, it is off-topic or contains too little information), ``Next Step`` should be ``plan_reasoning``; information will then be re-collected from the beginning.
+- If the content meets the requirements but the structure is confusing, or the report was not generated in accordance with the user's requirements, ``Next Step`` should be ``reporter``; the report will then be regenerated from the information already collected.
+- If both of the above fall short of the standard, decide which problem is more severe.
+
+## Outputs
+- Your result is expected to be a JSON string with multiple key-value pairs.
+- Each key should be one dimension that you evaluate, and its value should be the corresponding evaluation result.
+- For example, your output is expected to look like this:
+```json
+{
+  "Relevance": "The report is completely off-topic and unrelated to the title",
+  "Richness of content": "The report contains far too little information related to the stated topic",
+  "Readability": "The report is well-structured",
+  "Compliance": "The report meets the user's requirement that it must exceed 1000 words",
+  "Overall Evaluation": "failed",
+  "Next Step": "plan_reasoning"
+}
+```
+- Attention: the `` ```json `` fence above is only used to display JSON data in this markdown file. Do not add `` ```json `` fences to your output.
+- ``Richness of content`` is the most crucial indicator when evaluating a report. Judge it against the highest standards; even if you think the current report already meets the standard, you should still try to point out some shortcomings.
+- You are encouraged to add evaluation dimensions, but ``Overall Evaluation`` must always be set and its value can only be ``pass`` or ``failed``, and ``Next Step`` must be one of ``end``, ``plan_reasoning``, or ``reporter``.
\ No newline at end of file
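The prompt above asks the model for bare JSON with at least the `Overall Evaluation` and `Next Step` keys. Below is a minimal sketch of how that output can be turned into the routing string stored in `report_evaluation`; the fence stripping and key defaults are defensive assumptions on top of the patch (which calls `json.loads` directly), not project code.

```python
import json


def to_routing_string(llm_text: str) -> str:
    # Strip an accidental ```json fence; the prompt forbids it, but models sometimes add one anyway.
    cleaned = llm_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").removeprefix("json").strip()
    data = json.loads(cleaned)
    # Default to a failed verdict and a full re-plan if the expected keys are missing.
    result = data.get("Overall Evaluation", "failed")
    next_step = data.get("Next Step", "plan_reasoning")
    return f"{result} {next_step}"


print(to_routing_string('{"Overall Evaluation": "failed", "Next Step": "reporter"}'))  # failed reporter
```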
diff --git a/src/report/evaluator.py b/src/report/evaluator.py
new file mode 100644
index 0000000..ce1f777
--- /dev/null
+++ b/src/report/evaluator.py
@@ -0,0 +1,81 @@
+# ******************************************************************************
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# jiuwen-deepsearch is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+import json
+import logging
+
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain_core.runnables import RunnableConfig
+
+from src.llm.llm_wrapper import LLMWrapper
+from src.manager.search_context import SearchContext, Plan
+from src.prompts import apply_system_prompt
+
+logger = logging.getLogger(__name__)
+
+
+class Evaluator:
+    def __init__(self):
+        self._llm = LLMWrapper("basic")
+
+    def evaluate_report(self, context: SearchContext, config: RunnableConfig):
+        llm_input = apply_system_prompt("evaluator", context, config)
+
+        current_plan = context.get("current_plan")
+        if current_plan and current_plan.title:
+            llm_input.append(HumanMessage(
+                f"# The user's original report title: \n\n## Title\n\n{current_plan.title}\n\n##"
+            ))
+        else:
+            logger.error("Failed to find current plan in search context.")
+            return "failed", "cannot find current plan in search context"
+
+        report = context.get("report")
+        if report:
+            llm_input.append(HumanMessage(
+                f"# The report that you need to evaluate: \n\n## Report\n\n{report}\n\n##"
+            ))
+        else:
+            logger.error("Failed to find report in search context.")
+            return "failed", "cannot find report in search context"
+        logger.debug(f"evaluator prompts: {llm_input}")
+
+        llm_output = self._llm.invoke(llm_input)
+        llm_result = llm_output.content
+        data = json.loads(llm_result)
+        # Combine the verdict and the suggested next step into a single routing string,
+        # e.g. "pass end" or "failed reporter".
+        evaluate_result = data['Overall Evaluation'] + " " + data['Next Step']
+
+        evaluate_info = f"Report evaluation finished, result is: {evaluate_result}, detailed information is: {llm_result}"
+        logger.info(f'evaluation information: {evaluate_info}')
+        return evaluate_result, evaluate_info
+
+
+if __name__ == "__main__":
+    # Standalone smoke test: the report is deliberately unrelated to the plan title,
+    # so the evaluator is expected to return a "failed" verdict.
+    plan = Plan(title="大熊猫生活习性报告", thought="", is_research_completed=False)
+
+    context = SearchContext(
+        messages=[HumanMessage(content="大熊猫生活习性报告,主体部分需要总结为三个段落,字数800字以上", role="planner"),
+                  AIMessage(content="苹果手机销售总结报告如下:2024年仍占据市场第一", role="reporter")],
+        current_plan=plan,
+        report="苹果手机销售总结报告如下:2024年仍占据市场第一"
+    )
+    config = RunnableConfig({
+        "configurable": {
+            "language": "zh-CN"
+        }
+    })
+    evaluator = Evaluator()
+    result, info = evaluator.evaluate_report(context, config)
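The class above can also be exercised without a live model by stubbing the LLM. The sketch below is a hypothetical test, not part of the patch: the `FakeLLM` fixture and the asserted strings are assumptions, and it presumes `apply_system_prompt` can render the `evaluator` template with this context.

```python
import json

from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig

from src.manager.search_context import Plan, SearchContext
from src.report.evaluator import Evaluator


class FakeLLM:
    """Stub that returns a canned evaluation instead of calling a real model."""

    class _Output:
        content = json.dumps({
            "Relevance": "The report is unrelated to the title",
            "Overall Evaluation": "failed",
            "Next Step": "plan_reasoning",
        })

    def invoke(self, messages):
        return self._Output()


def test_off_topic_report_routes_to_plan_reasoning():
    evaluator = Evaluator.__new__(Evaluator)  # bypass __init__ so no real LLMWrapper is built
    evaluator._llm = FakeLLM()

    context = SearchContext(
        messages=[HumanMessage(content="Report request", name="planner")],
        current_plan=Plan(title="Giant panda habits report", thought="", is_research_completed=False),
        report="A summary of 2024 smartphone sales",  # deliberately off-topic
    )
    config = RunnableConfig({"configurable": {"language": "zh-CN"}})

    result, info = evaluator.evaluate_report(context, config)
    assert result == "failed plan_reasoning"
    assert "failed plan_reasoning" in info
```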
diff --git a/src/report/report.py b/src/report/report.py
index 6654f75..88aaa43 100644
--- a/src/report/report.py
+++ b/src/report/report.py
@@ -64,6 +64,12 @@ class Reporter:
             f"The following is the information collected during the task processing:\n\n{info}"
         ))
 
+        evaluation_details = context.get("evaluation_details", "")
+        if evaluation_details != "":
+            llm_input.append(HumanMessage(
+                f"The following is the evaluation result for your previous report; you are advised to focus on these opinions:\n\n{evaluation_details}"
+            ))
+
         try:
             logger.debug(f"reporter prompts: {llm_input}")
             llm_output = self._llm.invoke(llm_input)
-- 
Gitee