day33 Agent 错误恢复与回退策略
xsun_ai_study
错误类型分类与处理策略
错误分类体系
核心恢复策略矩阵
| 策略类型 | 适用场景 | 实现复杂度 | 恢复成功率 |
|---|---|---|---|
| 重试机制 | 临时性错误(网络、API限流) | 低 | 60-80% |
| 降级处理 | 工具不可用、功能缺失 | 中 | 70-90% |
| 熔断机制 | 服务持续故障 | 中 | 90-95% |
| 旁路策略 | 主路径失败 | 高 | 80-95% |
| 人工干预 | 复杂逻辑错误 | 低 | 95-100% |
多层级防御架构
第1层:预防层(Prevention)
from typing import Dict


class PreventiveMeasures:
    """Layer 1 (prevention): screen input before it reaches the agent.

    Both entry points are static and keep no state.
    """

    # Substrings that suggest an attempt to run code; matched case-insensitively.
    _MALICIOUS_KEYWORDS = ("system(", "exec(", "eval(")
    # Substrings that suggest a secret is being pasted; matched case-insensitively.
    _SENSITIVE_PATTERNS = ("密码", "token:", "apikey")
    # Shell metacharacter sequences stripped from string tool parameters.
    _INJECTION_TOKENS = (";", "`", "$(")

    @staticmethod
    def validate_input(user_input: str, max_length: int = 1000) -> "ValidationResult":
        """Run the input checks and report which ones failed.

        Args:
            user_input: Raw text supplied by the user.
            max_length: Maximum accepted length in characters.

        Returns:
            ``ValidationResult(valid=..., failures=[...])`` where ``failures``
            holds the names of the failed checks in check order.

        NOTE(review): ``ValidationResult`` is not defined in this snippet; it
        must accept ``valid`` and ``failures`` keyword arguments.
        """
        # Lower-case once so both keyword checks are case-insensitive. The
        # sensitive-info check used to compare against the raw text, so
        # "Token:" or "APIKEY" slipped through.
        lowered = user_input.lower()
        checks = [
            ("长度检查", len(user_input) <= max_length),
            (
                "恶意代码检查",
                not any(
                    keyword in lowered
                    for keyword in PreventiveMeasures._MALICIOUS_KEYWORDS
                ),
            ),
            (
                "敏感信息检查",
                not any(
                    pattern in lowered
                    for pattern in PreventiveMeasures._SENSITIVE_PATTERNS
                ),
            ),
            # NOTE(review): str.isprintable() is False for "\n" and "\t", so
            # multi-line input fails this check - confirm that is intended.
            ("编码检查", user_input.isprintable()),
        ]
        failures = [name for name, passed in checks if not passed]
        return ValidationResult(valid=not failures, failures=failures)

    @staticmethod
    def sanitize_tool_parameters(params: Dict) -> Dict:
        """Return a copy of ``params`` with injection tokens removed from strings.

        Only top-level ``str`` values are cleaned; other values are copied
        through unchanged. ``params`` itself is not modified.
        """
        sanitized = {}
        for key, value in params.items():
            if isinstance(value, str):
                sanitized[key] = PreventiveMeasures._strip_injection_tokens(value)
            else:
                sanitized[key] = value
        return sanitized

    @staticmethod
    def _strip_injection_tokens(text: str) -> str:
        """Remove every injection token, repeating until the text is stable.

        A single pass is not enough: removing the inner "$(" from "$$((" glues
        the outer characters into a fresh "$(". Each changing pass shortens the
        text, so the loop always terminates.
        """
        while True:
            cleaned = text
            for token in PreventiveMeasures._INJECTION_TOKENS:
                cleaned = cleaned.replace(token, "")
            if cleaned == text:
                return cleaned
            text = cleaned


# 第2层:检测层(Detection) -- next section heading (Layer 2: Detection)
import re
from typing import Dict, List


class ErrorDetector:
    """Layer 2 (detection): classify error messages and spot repeated steps."""

    def __init__(self):
        # Category -> substrings that identify it. Dict order is the match
        # priority used by classify_error (first hit wins).
        self.error_patterns = {
            "timeout": ["timeout", "timed out", "请求超时", "operation timeout", "连接超时"],
            "rate_limit": ["rate limit", "quota", "limit exceeded", "API调用次数超限", "429"],
            "authentication": [
                "unauthorized",
                "forbidden",
                "invalid token",
                "authentication failed",
                "401",
                "403",
            ],
            "validation": ["invalid parameter", "bad request", "validation failed", "参数错误", "400"],
            "server_error": ["internal server error", "server unavailable", "服务器错误", "500", "503"],
            "llm_error": ["content policy", "cannot fulfill", "refused", "抱歉,我无法", "根据我的使用条款"],
        }

    def classify_error(self, error_message: str) -> "ErrorType":
        """Map an error message to the first matching category.

        Args:
            error_message: Message text; it is passed through ``str()`` so an
                exception object is accepted as well.

        Returns:
            ``ErrorType(type, pattern, confidence)`` with confidence 0.9 for
            a match, or type ``"unknown"`` with confidence 0.0.

        NOTE(review): ``ErrorType`` is not defined in this snippet; it must
        accept ``type``, ``pattern`` and ``confidence`` keyword arguments.
        """
        message = str(error_message).lower()
        for error_type, patterns in self.error_patterns.items():
            for pattern in patterns:
                if self._pattern_matches(pattern, message):
                    return ErrorType(type=error_type, pattern=pattern, confidence=0.9)
        return ErrorType(type="unknown", pattern="", confidence=0.0)

    @staticmethod
    def _pattern_matches(pattern: str, message_lower: str) -> bool:
        """Return True when ``pattern`` occurs in the lower-cased message."""
        needle = pattern.lower()
        if needle.isdigit():
            # A status code must stand alone: "500" must not match "1500ms".
            code = re.escape(needle)
            return re.search(rf"(?<!\d){code}(?!\d)", message_lower) is not None
        return needle in message_lower

    def detect_infinite_loop(self, execution_history: List[Dict]) -> bool:
        """Heuristically decide whether the agent is repeating itself.

        Args:
            execution_history: Step dicts, oldest first. Only the
                ``"tool_name"`` and ``"parameters"`` keys are read.

        Returns:
            True when the last three steps use the same non-empty tool name,
            or when the last five steps (fewer if the history is shorter)
            contain fewer than three distinct (tool, parameters) states.
            Always False for fewer than three steps.
        """
        if len(execution_history) < 3:
            return False

        # Rule 1: the same tool three times in a row.
        recent_ops = [step.get("tool_name", "") for step in execution_history[-3:]]
        if recent_ops[0] and len(set(recent_ops)) == 1:
            return True

        # Rule 2: too few distinct states in the recent window. The tool name
        # is part of the state; comparing parameters alone flagged three
        # different parameterless tools as a loop.
        recent_states = {
            (step.get("tool_name", ""), str(step.get("parameters", {})))
            for step in execution_history[-5:]
        }
        return len(recent_states) < 3


# 第3层:恢复层(Recovery) -- next section heading (Layer 3: Recovery)
import logging
import time
from typing import Any, Callable, Dict, Optional

logger = logging.getLogger(__name__)


class RecoveryStrategies:
    """Layer 3 (recovery): retry, tool fallback and per-tool circuit breakers."""

    # Error categories for which a retry cannot help.
    _NON_RETRYABLE = ("authentication", "validation")

    # Failed tool -> ordered (fallback tool, confidence) candidates.
    _FALLBACK_CHAIN = {
        "web_search": [
            ("local_knowledge_base", 0.8),
            ("cached_search_results", 0.6),
            ("llm_general_knowledge", 0.4),
        ],
        "calculator": [
            ("simple_math_parser", 0.9),
            ("llm_calculation", 0.7),
            ("approximate_estimation", 0.5),
        ],
        "weather_api": [
            ("historical_weather", 0.8),
            ("seasonal_average", 0.6),
            ("manual_input", 0.3),
        ],
    }

    def __init__(
        self,
        llm_client,
        fallback_tools: Dict,
        detector: Optional["ErrorDetector"] = None,
    ):
        """Store collaborators and start with no circuit breakers.

        Args:
            llm_client: Stored as ``self.llm``; not used by the methods here.
            fallback_tools: Fallback tool name -> callable taking the
                original parameter dict.
            detector: Classifier used by ``retry_with_backoff``. Defaults to
                a new ``ErrorDetector``; the attribute was previously read
                but never assigned.
        """
        self.llm = llm_client
        self.fallback_tools = fallback_tools
        self.circuit_breakers = {}
        self.detector = detector if detector is not None else ErrorDetector()

    def retry_with_backoff(
        self, func: Callable, max_retries: int = 3, initial_delay: float = 1.0
    ) -> Any:
        """Call ``func`` until it succeeds, doubling the delay between tries.

        Args:
            func: Zero-argument callable to execute.
            max_retries: Total number of attempts; values below 1 are treated
                as 1 so ``func`` is always called at least once.
            initial_delay: Seconds to sleep before the second attempt.

        Returns:
            Whatever ``func`` returns on its first successful call.

        Raises:
            Exception: The last exception from ``func`` once attempts run
                out, or immediately for non-retryable error categories.
        """
        attempts = max(1, max_retries)
        # Tolerate instances built without __init__ (no detector attribute).
        detector = getattr(self, "detector", None)
        delay = initial_delay
        for attempt in range(1, attempts + 1):
            try:
                return func()
            except Exception as exc:
                if attempt == attempts:
                    raise
                if detector is not None:
                    error_type = detector.classify_error(str(exc))
                    # Bad credentials or bad input fail the same way each time.
                    if error_type.type in self._NON_RETRYABLE:
                        raise
                logger.warning("重试%d/%d:%s", attempt, attempts, exc)
                time.sleep(delay)
                delay *= 2  # exponential backoff

    def fallback_to_simpler_tool(
        self, failed_tool: str, original_params: Dict, context: Dict
    ) -> Any:
        """Try the configured fallbacks for ``failed_tool`` in order.

        Args:
            failed_tool: Name of the tool that failed.
            original_params: Parameters passed unchanged to each fallback.
            context: Accepted for interface compatibility; unused.

        Returns:
            ``{"result", "source", "confidence", "is_fallback"}`` from the
            first fallback that succeeds, or None when the tool has no chain
            or every registered fallback raised.
        """
        for fallback_tool, confidence in self._FALLBACK_CHAIN.get(failed_tool, ()):
            if fallback_tool not in self.fallback_tools:
                continue
            try:
                result = self.fallback_tools[fallback_tool](original_params)
            except Exception as exc:
                # Deliberate best effort: log and move on. Catching Exception
                # rather than everything lets KeyboardInterrupt and
                # SystemExit propagate.
                logger.warning("降级工具%s调用失败:%s", fallback_tool, exc)
                continue
            logger.info("使用降级工具%s(置信度:%s)", fallback_tool, confidence)
            return {
                "result": result,
                "source": fallback_tool,
                "confidence": confidence,
                "is_fallback": True,
            }
        return None

    def _get_breaker(self, tool_name: str) -> Dict:
        """Return the breaker record for ``tool_name``, creating it if needed."""
        return self.circuit_breakers.setdefault(
            tool_name, {"failures": 0, "last_failure": None, "state": "closed"}
        )

    def circuit_breaker(
        self,
        tool_name: str,
        failure_threshold: int = 5,
        recovery_timeout: float = 60.0,
    ) -> bool:
        """Decide whether a call to ``tool_name`` may proceed.

        The caller must report the outcome via ``update_circuit_state``.

        Args:
            tool_name: Tool whose breaker is consulted.
            failure_threshold: Consecutive failures that trip the breaker.
            recovery_timeout: Seconds to stay open before one probe call is
                allowed (previously hard-coded to 60).

        Returns:
            True when the call is allowed, False when it is blocked.
        """
        cb = self._get_breaker(tool_name)
        now = time.time()

        if cb["state"] == "open":
            last_failure = cb["last_failure"]
            if last_failure is not None and now - last_failure > recovery_timeout:
                # Cool-down elapsed: let exactly one probe through.
                cb["state"] = "half-open"
                cb["probe_started"] = now
                return True
            return False

        if cb["state"] == "half-open":
            # A probe is in flight, so block other callers. The state is left
            # untouched so update_circuit_state can close the breaker on
            # success. If the probe's outcome is never reported, allow a new
            # probe after another timeout instead of blocking forever.
            probe_started = cb.get("probe_started")
            if probe_started is None or now - probe_started > recovery_timeout:
                cb["probe_started"] = now
                return True
            return False

        # Closed state: trip once the failure count reaches the threshold.
        if cb["failures"] >= failure_threshold:
            cb["state"] = "open"
            cb["last_failure"] = now
            logger.warning("熔断器触发:%s", tool_name)
            return False
        return True

    def update_circuit_state(self, tool_name: str, success: bool):
        """Record the outcome of a call so the breaker can change state.

        Outcomes for tools not seen before are recorded too; they used to be
        silently dropped.
        """
        cb = self._get_breaker(tool_name)
        cb.pop("probe_started", None)  # any pending probe has now reported
        if success:
            cb["failures"] = 0
            if cb["state"] == "half-open":
                cb["state"] = "closed"  # probe succeeded: close the breaker
        else:
            cb["failures"] += 1
            cb["last_failure"] = time.time()
            if cb["state"] == "half-open":
                cb["state"] = "open"  # probe failed: restart the cool-down


# 第4层:旁路层(Bypass) -- next section heading (Layer 4: Bypass)
import itertools
import random
import re
from typing import Callable, List, Optional


class BypassStrategies:
    """Layer 4 (bypass): approximate answers and alternative execution paths."""

    # (compiled query pattern, candidate replies), checked in order. DOTALL
    # lets "." cross line breaks so multi-line queries still match.
    _APPROXIMATION_RULES = (
        (
            re.compile(r".*多少.*钱.*", re.DOTALL),
            (
                "根据市场行情,类似产品价格在XXX-XXX元之间",
                "价格因地区和时间而异,通常范围是...",
                "我无法获取实时价格,但可以参考历史数据...",
            ),
        ),
        (
            re.compile(r".*天气.*", re.DOTALL),
            (
                "当前季节该地区通常天气是...",
                "根据天气预报模型,预计...",
                "可以参考邻近城市的天气情况...",
            ),
        ),
        (
            re.compile(r".*时间.*", re.DOTALL),
            (
                "通常需要XXX小时,具体取决于...",
                "历史平均时间是...",
                "根据类似情况估计...",
            ),
        ),
    )

    # Tool -> substitutes tried by alternative_paths, in order.
    _TOOL_ALTERNATIVES = {
        "web_search": ["local_search", "knowledge_base_query"],
        "calculator": ["llm_calculation", "rule_based_estimation"],
        "weather_api": ["historical_data", "seasonal_pattern"],
    }

    @staticmethod
    def semantic_approximation(query: str, available_data: List) -> str:
        """Return a canned approximate reply for the query's topic.

        Args:
            query: User question; matched against the topic patterns.
            available_data: Accepted for interface compatibility; unused.

        Returns:
            A randomly chosen reply for the first matching topic, or a
            generic fallback sentence when no topic matches.
        """
        for pattern, responses in BypassStrategies._APPROXIMATION_RULES:
            if pattern.match(query):
                return random.choice(responses)
        return "虽然无法提供精确答案,但根据一般情况..."

    @staticmethod
    def stepwise_refinement(
        problem: str,
        max_steps: int = 3,
        llm: Optional[Callable[[str], str]] = None,
    ) -> List[str]:
        """Ask an LLM to split a complex problem into simpler questions.

        Args:
            problem: The original complex question.
            max_steps: Upper bound on sub-questions, stated in the prompt and
                enforced on the result.
            llm: Callable taking the prompt and returning the reply text.
                Defaults to the module-level ``llm_call``, which is not
                defined in this snippet and must be provided elsewhere.

        Returns:
            Non-empty, stripped lines of the reply, at most ``max_steps``.
        """
        refinement_prompt = (
            f"将以下复杂问题分解为不超过{max_steps}个简单问题:\n"
            f"原问题:{problem}\n"
            "分解步骤(每个步骤应该是独立可回答的问题):\n"
            "1. "
        )
        ask = llm if llm is not None else llm_call
        decomposed = ask(refinement_prompt)
        # Drop blank lines, which a plain split("\n") returned as steps.
        steps = [line.strip() for line in decomposed.splitlines() if line.strip()]
        return steps[:max_steps]

    @staticmethod
    def alternative_paths(
        main_path: List[str],
        available_tools: List[str],
        max_alternatives: int = 5,
    ) -> List[List[str]]:
        """Generate up to ``max_alternatives`` variants of an execution path.

        Tool substitutions come first, in path order, followed by
        reorderings of the original path. Duplicates and the original path
        itself are never returned.

        Args:
            main_path: Ordered tool names of the failed path.
            available_tools: Tool names that may be used as substitutes.
            max_alternatives: Maximum number of paths to return.

        Returns:
            A list of alternative paths, each a new list.
        """
        if max_alternatives <= 0:
            return []

        alternatives: List[List[str]] = []
        seen = {tuple(main_path)}
        available = set(available_tools)

        # 1. Substitute one tool at a time. enumerate keeps the position, so
        #    a tool that appears twice is replaced at each occurrence rather
        #    than always at the first one.
        for index, tool in enumerate(main_path):
            for alt in BypassStrategies._TOOL_ALTERNATIVES.get(tool, ()):
                if alt not in available:
                    continue
                candidate = list(main_path)
                candidate[index] = alt
                if tuple(candidate) in seen:
                    continue
                seen.add(tuple(candidate))
                alternatives.append(candidate)
                if len(alternatives) >= max_alternatives:
                    return alternatives

        # 2. Reorder the path (only valid when step order does not matter).
        #    Permutations are consumed lazily and the loop stops once enough
        #    are collected; building all n! first is infeasible beyond a
        #    handful of steps.
        # NOTE(review): a path with many repeated tools can still scan a lot
        # of identical permutations before finding distinct ones.
        if len(main_path) > 1 and len(set(main_path)) > 1:
            for perm in itertools.permutations(main_path):
                if perm in seen:
                    continue
                seen.add(perm)
                alternatives.append(list(perm))
                if len(alternatives) >= max_alternatives:
                    break
        return alternatives


# 第5层:修复层(Repair) -- next section heading (Layer 5: Repair)
import json
import logging
from collections import Counter
from datetime import datetime
from typing import Dict, List

logger = logging.getLogger(__name__)


class AutoRepairMechanisms:
    """Layer 5 (repair): fix bad LLM output, deadlocks and conflicting data.

    NOTE(review): ``_create_default_response`` and ``_find_simpler_path`` are
    called below but are not defined in the visible part of this class; they
    must be provided elsewhere.
    """

    def __init__(self, llm_client):
        """Store the LLM client (must expose ``call(prompt)``)."""
        self.llm = llm_client
        self.repair_history = []

    def repair_invalid_response(self, invalid_response: str, expected_format: str) -> str:
        """Ask the LLM to rewrite a response that violates the expected format.

        Args:
            invalid_response: The malformed LLM output.
            expected_format: Description of the format it should have had.

        Returns:
            The repaired response, or the result of
            ``self._create_default_response(expected_format)`` when the LLM
            call raises. Successful repairs are appended to
            ``self.repair_history``.
        """
        repair_prompt = (
            "以下LLM响应不符合预期格式。请修复它。\n"
            f"预期格式:{expected_format}\n"
            f"无效响应:{invalid_response}\n"
            "问题分析:\n"
            "1. 格式错误(如缺少字段、错误分隔符)\n"
            "2. 内容错误(如逻辑矛盾、事实错误)\n"
            "3. 结构错误(如嵌套错误、类型错误)\n"
            "修复后的响应:\n"
        )
        try:
            repaired = self.llm.call(repair_prompt)
        except Exception:
            # Best effort: fall back to a default structure, but record why.
            logger.exception("LLM修复调用失败,返回默认结构")
            return self._create_default_response(expected_format)
        self.repair_history.append(
            {
                "original": invalid_response,
                "repaired": repaired,
                "timestamp": datetime.now(),
            }
        )
        return repaired

    def recover_from_deadlock(self, agent_state: Dict, execution_history: List) -> Dict:
        """Produce a new agent state that escapes a deadlock.

        Args:
            agent_state: Current state; copied, never mutated. Must contain
                ``"goal"`` when no successful step exists.
            execution_history: Step dicts, oldest first.

        Returns:
            A copy of ``agent_state`` rolled back to the most recent step
            whose ``"status"`` is ``"success"``, or a soft reset (step 0,
            empty context, a simpler execution path) when there is none.
            ``"recovery_action"`` names the strategy used.
        """
        # Strategy 1: roll back to the most recent successful step.
        last_stable = next(
            (
                state
                for state in reversed(execution_history)
                if state.get("status") == "success"
            ),
            None,
        )
        if last_stable is not None:
            logger.info("回退到稳定状态:%s", last_stable.get("step_id"))
            return {
                **agent_state,
                "current_step": last_stable.get("step_id"),
                "context": last_stable.get("context", {}),
                "recovery_action": "rollback_to_stable",
            }

        # Strategy 2: nothing to roll back to, so restart from scratch.
        logger.warning("无稳定状态可用,执行软重置")
        return {
            **agent_state,
            "current_step": 0,
            "context": {},
            "execution_path": self._find_simpler_path(agent_state["goal"]),
            "recovery_action": "soft_reset",
        }

    def fix_data_inconsistency(self, data_sources: List[Dict]) -> Dict:
        """Reconcile conflicting values reported by several data sources.

        Strategies, in order: strict majority vote (confidence 0.8),
        confidence-weighted average of numeric values (confidence 0.7), then
        LLM arbitration. Sources whose ``"value"`` is missing or None are
        ignored by the first two strategies.

        Args:
            data_sources: Dicts with a ``"value"`` and an optional numeric
                ``"confidence"`` (default 0.5).

        Returns:
            A dict with at least a ``"value"`` key.
        """
        # A source without a value carries no information. Missing values
        # used to be averaged as 0, and None could win the vote.
        reported = [
            source for source in data_sources if source.get("value") is not None
        ]

        # Strategy 1: strict majority vote.
        try:
            counts = Counter(source["value"] for source in reported)
        except TypeError:
            counts = Counter()  # unhashable values cannot be voted on
        if counts:
            value, votes = counts.most_common(1)[0]
            if votes > len(reported) / 2:
                return {"value": value, "confidence": 0.8}

        # Strategy 2: confidence-weighted average of the numeric values.
        weighted = []
        for source in reported:
            confidence = source.get("confidence")
            try:
                number = float(source["value"])
                weight = 0.5 if confidence is None else float(confidence)
            except (TypeError, ValueError):
                continue  # non-numeric value or weight: skip this source
            weighted.append((number, weight))
        if weighted:
            total_weight = sum(weight for _, weight in weighted)
            if total_weight:
                average = sum(n * w for n, w in weighted) / total_weight
            else:
                # All weights are zero: fall back to a plain mean.
                average = sum(n for n, _ in weighted) / len(weighted)
            return {"value": average, "confidence": 0.7}

        # Strategy 3: let the LLM arbitrate. default=str keeps values that
        # JSON cannot encode (e.g. datetime timestamps) from raising.
        serialized = json.dumps(
            data_sources, indent=2, ensure_ascii=False, default=str
        )
        arbitration_prompt = (
            "以下数据源提供的信息不一致,请分析并给出最可能正确的值:\n"
            f"数据源:{serialized}\n"
            "请综合考虑数据源的可信度、时间戳和内在逻辑。\n"
            '输出格式:{"value": "最可能的值", "reasoning": "推理过程"}\n'
        )
        response = self.llm.call(arbitration_prompt)

        # Honour the declared Dict return type whatever the client returns.
        if isinstance(response, dict):
            return response
        if isinstance(response, str):
            try:
                parsed = json.loads(response)
            except ValueError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
        # Not JSON: hand back the raw reply under "value".
        return {"value": response}