From 3df3041caf2f315e64cf5e8393416090a7e488e5 Mon Sep 17 00:00:00 2001
From: undercrater
Date: Mon, 21 Jul 2025 21:09:52 +0800
Subject: [PATCH] feat: implement the Evaluator report evaluation node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/manager/nodes.py          | 28 ++++++++----
 src/manager/search_context.py |  1 +
 src/prompts/evaluator.md      | 61 ++++++++++++++++++++++++++
 src/report/evaluator.py       | 81 +++++++++++++++++++++++++++++++++++
 src/report/report.py          |  6 +++
 5 files changed, 169 insertions(+), 8 deletions(-)
 create mode 100644 src/prompts/evaluator.md
 create mode 100644 src/report/evaluator.py

diff --git a/src/manager/nodes.py b/src/manager/nodes.py
index bb23ddd..7e8c414 100644
--- a/src/manager/nodes.py
+++ b/src/manager/nodes.py
@@ -22,6 +22,7 @@ from src.prompts import apply_system_prompt
 from src.query_understanding.planner import Planner
 from src.query_understanding.router import classify_query
 from src.report import Reporter, ReportLang, ReportFormat, ReportStyle
+from src.report.evaluator import Evaluator
 from src.retrieval.collector import Collector
 
 logger = logging.getLogger(__name__)
@@ -91,23 +92,27 @@ def research_manager_node(context: SearchContext, config: RunnableConfig) -> Com
         logger.info(f"Has executed {plan_executed_num} plans, go to next plan reasoning")
         return Command(update={"plan_executed_num": plan_executed_num}, goto="plan_reasoning")
 
+    report_generated_num = context.get("report_generated_num", 0)
+    max_report_generated_num = config.get("configurable", {}).get("max_report_generated_num", 0)
+    if report_generated_num >= max_report_generated_num:
+        logger.info(f"reached max generation num: {max_report_generated_num}")
+        return Command(goto="__end__")
+
     # The report has been generated, and if the report_evaluation is empty, go to evaluator,
     report_evaluation = context.get("report_evaluation", "")
     if report_evaluation == "":
         return Command(goto="evaluator")
 
     # If the report_evaluation is "pass", terminate, otherwise, re-execute the plan
-    if report_evaluation == "pass":
+    evaluation_result, _, next_step = report_evaluation.partition(" ")
+    if evaluation_result == "pass":
         logger.info(f"report evaluation passed")
         return Command(goto="__end__")
 
     logger.info(f"report evaluation not pass")
-    report_generated_num = context.get("report_generated_num", 0)
-    max_report_generated_num = config.get("configurable", {}).get("max_report_generated_num", 0)
-    if report_generated_num >= max_report_generated_num:
-        logger.info(f"reached max generation num: {max_report_generated_num}")
-        return Command(goto="__end__")
-    return Command(goto="plan_reasoning")
+    if next_step not in ("plan_reasoning", "reporter"):
+        next_step = "plan_reasoning"
+    return Command(update={"report": "", "report_evaluation": ""}, goto=next_step)
 
 
 async def info_collector_node(context: SearchContext, config: RunnableConfig) -> Command:
@@ -200,7 +205,14 @@ def reporter_node(context: SearchContext, config: RunnableConfig) -> Command:
 
 def evaluator_node(context: SearchContext, config: RunnableConfig) -> Command:
     logger.info(f"start evaluator node: \n{context}")
+
+    evaluator = Evaluator()
+    result, info = evaluator.evaluate_report(context, config)
     return Command(
-        update={"report_evaluation": "pass"},
+        update={
+            "report_evaluation": result,
+            "evaluation_details": info,
+            "messages": [HumanMessage(content=info, name="evaluator")],
+        },
         goto="research_manager",
     )
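For reference, the evaluator and the research manager now share an implicit contract: `report_evaluation` carries a two-token string of the form `"<overall result> <next step>"`, such as `pass end`, `failed plan_reasoning`, or `failed reporter`. The snippet below is an illustrative sketch of that parsing contract; `parse_evaluation` is a hypothetical helper, not part of the patch, which performs the same parsing that `research_manager_node` does inline.

```python
def parse_evaluation(report_evaluation: str) -> tuple[str, str]:
    # Split "pass end" / "failed reporter" into the verdict and the suggested next step.
    result, _, next_step = report_evaluation.partition(" ")
    # Fall back to re-planning when the suggested next step is missing or unknown.
    if result != "pass" and next_step not in ("plan_reasoning", "reporter"):
        next_step = "plan_reasoning"
    return result, next_step


assert parse_evaluation("pass end") == ("pass", "end")
assert parse_evaluation("failed reporter") == ("failed", "reporter")
assert parse_evaluation("failed") == ("failed", "plan_reasoning")
```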
diff --git a/src/manager/search_context.py b/src/manager/search_context.py
index 88b017b..831adee 100644
--- a/src/manager/search_context.py
+++ b/src/manager/search_context.py
@@ -52,3 +52,4 @@ class SearchContext(MessagesState):
     report: str = ""
     report_generated_num: int = 0
     report_evaluation: str = ""
+    evaluation_details: str = ""
diff --git a/src/prompts/evaluator.md b/src/prompts/evaluator.md
new file mode 100644
index 0000000..2576d20
--- /dev/null
+++ b/src/prompts/evaluator.md
@@ -0,0 +1,61 @@
+---
+CURRENT TIME: {{CURRENT_TIME}}
+---
+
+# Report Evaluation Agent
+
+## Role
+- You are a strict, professional report reviewer who is proficient in the evaluation criteria for different types of reports.
+- Your main responsibility is to evaluate, based on the given report title, whether the quality of a report meets the standards across multiple dimensions.
+- Rather than affirming the value of the report, strive to identify its shortcomings and give it a ``failed`` verdict, even if you think the report already meets the standard.
+
+## Input
+- Title: the original title of the report that the user wants to generate
+- Report: the full report that you need to evaluate
+
+## Steps
+
+There are several dimensions that you must evaluate for every report.
+You are also encouraged to add further dimensions for a more comprehensive result.
+
+### Relevance
+
+- The content of the report should be strongly relevant to the original title.
+
+### Richness of content
+
+- The content of the report should be detailed, comprehensive, and in-depth.
+
+### Readability
+
+- The report should be well structured with clear sections and subsections. If it involves comparisons or listings, it is advisable to present them in tables.
+
+### Compliance
+
+- If the user has specific requirements for the report, they must be strictly adhered to.
+
+### Next Step
+
+- If the evaluation result is ``pass``, the value of ``Next Step`` should be ``end``.
+- If the evaluation result is ``failed``, you should determine which stage is responsible for the failure and should regenerate the report.
+- If the content of the report fails the evaluation (for example, it is off-topic or contains too little information), ``Next Step`` should be ``plan_reasoning``; information will then be re-collected from the beginning.
+- If the content meets the requirements but the structure is confusing, or the report was not generated in accordance with the user's requirements, ``Next Step`` should be ``reporter``; the report will then be regenerated from the information already collected.
+- If both of the above fall short of the standard, decide which problem is more severe.
+
+## Outputs
+- Your result is expected to be a JSON string with multiple key-value pairs.
+- Each key should be one dimension that you evaluate, and its value should be the corresponding evaluation result.
+- For example, your output is expected to look like this:
+```json
+{
+  "Relevance": "The report is completely off-topic and unrelated to the title",
+  "Richness of content": "The report contains far too little information related to the stated topic",
+  "Readability": "The report is well-structured",
+  "Compliance": "The report meets the user's requirement that it must exceed 1000 words",
+  "Overall Evaluation": "failed",
+  "Next Step": "plan_reasoning"
+}
+```
+- Attention: the `` ```json `` fence above is only used to display JSON data in this markdown file. Do not add `` ```json `` fences to your output.
+- ``Richness of content`` is the most crucial indicator when evaluating a report. Judge it against the highest standards; even if you think the current report already meets the standard, you should still try to point out some shortcomings.
+- You are encouraged to add evaluation dimensions, but ``Overall Evaluation`` must always be set and its value can only be ``pass`` or ``failed``, and ``Next Step`` must be one of ``end``, ``plan_reasoning``, or ``reporter``.
\ No newline at end of file
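The prompt above asks the model for bare JSON with at least the `Overall Evaluation` and `Next Step` keys. Below is a minimal sketch of how that output can be turned into the routing string stored in `report_evaluation`; the fence stripping and key defaults are defensive assumptions on top of the patch (which calls `json.loads` directly), not project code.

```python
import json


def to_routing_string(llm_text: str) -> str:
    # Strip an accidental ```json fence; the prompt forbids it, but models sometimes add one anyway.
    cleaned = llm_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").removeprefix("json").strip()
    data = json.loads(cleaned)
    # Default to a failed verdict and a full re-plan if the expected keys are missing.
    result = data.get("Overall Evaluation", "failed")
    next_step = data.get("Next Step", "plan_reasoning")
    return f"{result} {next_step}"


print(to_routing_string('{"Overall Evaluation": "failed", "Next Step": "reporter"}'))  # failed reporter
```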
diff --git a/src/report/evaluator.py b/src/report/evaluator.py
new file mode 100644
index 0000000..ce1f777
--- /dev/null
+++ b/src/report/evaluator.py
@@ -0,0 +1,81 @@
+# ******************************************************************************
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# jiuwen-deepsearch is licensed under Mulan PSL v2.
+# You can use this software according to the terms and conditions of the Mulan PSL v2.
+# You may obtain a copy of Mulan PSL v2 at:
+# http://license.coscl.org.cn/MulanPSL2
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+# See the Mulan PSL v2 for more details.
+# ******************************************************************************/
+import json
+import logging
+
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain_core.runnables import RunnableConfig
+
+from src.llm.llm_wrapper import LLMWrapper
+from src.manager.search_context import SearchContext, Plan
+from src.prompts import apply_system_prompt
+
+logger = logging.getLogger(__name__)
+
+
+class Evaluator:
+    def __init__(self):
+        self._llm = LLMWrapper("basic")
+
+    def evaluate_report(self, context: SearchContext, config: RunnableConfig):
+        llm_input = apply_system_prompt("evaluator", context, config)
+
+        current_plan = context.get("current_plan")
+        if current_plan and current_plan.title:
+            llm_input.append(HumanMessage(
+                f"# The user's original report title: \n\n## Title\n\n{current_plan.title}\n\n##"
+            ))
+        else:
+            logger.error("Failed to find current plan in search context.")
+            return "failed", "cannot find current plan in search context"
+
+        report = context.get("report")
+        if report:
+            llm_input.append(HumanMessage(
+                f"# The report that you need to evaluate: \n\n## Report\n\n{report}\n\n##"
+            ))
+        else:
+            logger.error("Failed to find report in search context.")
+            return "failed", "cannot find report in search context"
+        logger.debug(f"evaluator prompts: {llm_input}")
+
+        llm_output = self._llm.invoke(llm_input)
+        llm_result = llm_output.content
+        data = json.loads(llm_result)
+        # Combine the verdict and the suggested next step into a single routing string,
+        # e.g. "pass end" or "failed reporter".
+        evaluate_result = data['Overall Evaluation'] + " " + data['Next Step']
+
+        evaluate_info = f"Report evaluation finished, result is: {evaluate_result}, detailed information is: {llm_result}"
+        logger.info(f'evaluation information: {evaluate_info}')
+        return evaluate_result, evaluate_info
+
+
+if __name__ == "__main__":
+    # Standalone smoke test: the report is deliberately unrelated to the plan title,
+    # so the evaluator is expected to return a "failed" verdict.
+    plan = Plan(title="大熊猫生活习性报告", thought="", is_research_completed=False)
+
+    context = SearchContext(
+        messages=[HumanMessage(content="大熊猫生活习性报告,主体部分需要总结为三个段落,字数800字以上", role="planner"),
+                  AIMessage(content="苹果手机销售总结报告如下:2024年仍占据市场第一", role="reporter")],
+        current_plan=plan,
+        report="苹果手机销售总结报告如下:2024年仍占据市场第一"
+    )
+    config = RunnableConfig({
+        "configurable": {
+            "language": "zh-CN"
+        }
+    })
+    evaluator = Evaluator()
+    result, info = evaluator.evaluate_report(context, config)
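The class above can also be exercised without a live model by stubbing the LLM. The sketch below is a hypothetical test, not part of the patch: the `FakeLLM` fixture and the asserted strings are assumptions, and it presumes `apply_system_prompt` can render the `evaluator` template with this context.

```python
import json

from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig

from src.manager.search_context import Plan, SearchContext
from src.report.evaluator import Evaluator


class FakeLLM:
    """Stub that returns a canned evaluation instead of calling a real model."""

    class _Output:
        content = json.dumps({
            "Relevance": "The report is unrelated to the title",
            "Overall Evaluation": "failed",
            "Next Step": "plan_reasoning",
        })

    def invoke(self, messages):
        return self._Output()


def test_off_topic_report_routes_to_plan_reasoning():
    evaluator = Evaluator.__new__(Evaluator)  # bypass __init__ so no real LLMWrapper is built
    evaluator._llm = FakeLLM()

    context = SearchContext(
        messages=[HumanMessage(content="Report request", name="planner")],
        current_plan=Plan(title="Giant panda habits report", thought="", is_research_completed=False),
        report="A summary of 2024 smartphone sales",  # deliberately off-topic
    )
    config = RunnableConfig({"configurable": {"language": "zh-CN"}})

    result, info = evaluator.evaluate_report(context, config)
    assert result == "failed plan_reasoning"
    assert "failed plan_reasoning" in info
```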
diff --git a/src/report/report.py b/src/report/report.py
index 6654f75..88aaa43 100644
--- a/src/report/report.py
+++ b/src/report/report.py
@@ -64,6 +64,12 @@ class Reporter:
             f"The following is the information collected during the task processing:\n\n{info}"
         ))
 
+        evaluation_details = context.get("evaluation_details", "")
+        if evaluation_details != "":
+            llm_input.append(HumanMessage(
+                f"The following is the evaluation result for your previous report; you are advised to focus on these opinions:\n\n{evaluation_details}"
+            ))
+
         try:
             logger.debug(f"reporter prompts: {llm_input}")
             llm_output = self._llm.invoke(llm_input)
-- 
Gitee