1. Erlang错误处理哲学
在咖啡厅里看到服务员打翻杯子时,其他服务员会立即清理现场并继续服务。这种"快速失败,优雅恢复"的理念正是Erlang错误处理的核心。Erlang/OTP通过进程隔离、监督树和错误传播三大法宝,让系统像训练有素的服务团队般可靠。
2. 基础错误处理机制
2.1 进程链接(link)实战
%% Barista process.
%% Spawns a process linked to the caller that waits for exactly one message,
%% handles it and then terminates. Returns the pid of the new process.
start_barista() ->
    spawn_link(fun() ->
        receive
            Message -> handle_barista_message(Message)
        end
    end).

%% Handles the single message a barista accepts before it terminates.
handle_barista_message(brew_coffee) ->
    io:format("正在冲泡咖啡~n"),
    timer:sleep(1000);
handle_barista_message(_Unexpected) ->
    %% Crash deliberately on anything that is not a brew request.
    exit(unexpected_message).
%% Cashier process.
%% Spawns a process (linked to the caller) that watches the barista
%% `BaristaPid' and calls restart_barista/0 once the barista exits.
%% Returns the pid of the cashier process.
start_cashier(BaristaPid) ->
    spawn_link(fun() ->
        %% Trap exits before linking, so that linking to an already-dead
        %% barista arrives as an {'EXIT', Pid, noproc} message instead of
        %% killing the cashier.
        process_flag(trap_exit, true),
        %% spawn_link/1 only links the cashier to its spawner. Without an
        %% explicit link to the barista, the 'EXIT' clause below could never
        %% match.
        link(BaristaPid),
        receive
            {'EXIT', BaristaPid, Reason} ->
                io:format("咖啡师~p异常退出,原因:~p~n", [BaristaPid, Reason]),
                restart_barista();
            _ -> ok
        end
    end).
注释说明:
- spawn_link创建带有双向链接的进程
- process_flag(trap_exit, true)使进程能够捕获退出信号
- exit/1主动触发进程终止
- 通过接收EXIT消息实现错误通知
2.2 监控(monitor)示例
%% Starts a coffee machine process, monitors it and blocks until it goes
%% down; then reports the failure and calls schedule_maintenance/0.
start_coffee_machine_monitor() ->
    %% spawn_monitor/1 creates the process and the monitor atomically. With a
    %% separate spawn + monitor, a process that dies in between is reported
    %% with reason `noproc' and the real exit reason is lost.
    {MachinePid, Ref} = spawn_monitor(fun coffee_machine/0),
    receive
        {'DOWN', Ref, process, MachinePid, Reason} ->
            io:format("咖啡机~p故障,原因:~p~n", [MachinePid, Reason]),
            schedule_maintenance()
    end.
%% Coffee machine body: waits up to 5 seconds for a `brew' message.
%% A brew fails with exit reason `overheating' when the random roll is 1
%% (one chance in ten) and returns `ok' otherwise. If no `brew' arrives in
%% time the process exits with `idle_timeout'.
coffee_machine() ->
    receive
        brew -> brew_outcome(rand:uniform(10))
    after 5000 ->
        exit(idle_timeout)
    end.

%% Maps the random roll to the result of a brew attempt.
brew_outcome(1) -> exit(overheating);
brew_outcome(_Roll) -> ok.
技术亮点:
- monitor创建单向监控关系
- 支持监控非链接进程
- DOWN消息包含详细的错误原因
- 适用于临时性监控场景
3. OTP框架中的错误处理策略
3.1 Supervisor行为模式
-module(coffee_shop_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
%% Starts the supervisor and registers it locally under the module name.
%% The module itself is the callback module; init/1 receives [].
start_link() ->
    Module = ?MODULE,
    supervisor:start_link({local, Module}, Module, []).
%% Supervisor callback. Returns the restart strategy and the child
%% specifications: a permanent barista worker and a transient cashier worker,
%% supervised one_for_one with at most 5 restarts per 60 seconds.
init([]) ->
    SupFlags = {one_for_one, 5, 60},
    Children = [
        worker_spec(barista_worker, barista, permanent),
        worker_spec(cashier_worker, cashier, transient)
    ],
    {ok, {SupFlags, Children}}.

%% Builds a tuple-style child spec for a worker that is started through
%% Mod:start_link/0 and given 5000 ms to shut down.
worker_spec(Id, Mod, Restart) ->
    {Id, {Mod, start_link, []}, Restart, 5000, worker, [Mod]}.
参数解析:
- one_for_one重启策略:只重启故障子进程
- permanent表示必须始终存在的进程
- transient进程仅在异常终止时重启
- 5次/60秒的重启频率限制:60秒内重启超过5次时,监督者会终止所有子进程并自行退出
3.2 gen_server错误处理模板
-module(order_processor).
-behaviour(gen_server).
%% gen_server callback for synchronous `{place_order, Items}' requests.
%% Items are checked with validate_order/1 (not shown here):
%%   {ok, Validated}  -> replies {ok, process_order(Validated)} and keeps State;
%%   {error, Reason}  -> stops the server with reason {invalid_order, Reason}.
%% No reply is sent on the stop path, so the pending gen_server:call in the
%% client exits when the server terminates.
handle_call({place_order, Items}, _From, State) ->
    case validate_order(Items) of
        {ok, Validated} ->
            {reply, {ok, process_order(Validated)}, State};
        {error, Reason} ->
            {stop, {invalid_order, Reason}, State}
    end.
%% The clause above is the last handle_call/3 clause, so it must end with a
%% full stop; a trailing `;' before terminate/2 is a head-mismatch error.
%% gen_server callback invoked when the server is about to stop.
%% Logs the termination reason through error_logger and returns ok.
terminate(Reason, _State) ->
    LogArgs = [Reason],
    error_logger:error_msg("订单处理器异常终止,原因:~p~n", LogArgs),
    ok.
%% gen_server callback for hot code upgrades.
%% The state needs no conversion, so it is handed back unchanged.
code_change(_FromVsn, CurrentState, _Extra) ->
    {ok, CurrentState}.
关键点说明:
- gen_server提供标准化的错误处理流程
- stop返回值触发有序关闭
- terminate回调用于清理资源
- code_change支持热代码升级
4. 高级错误处理模式
4.1 错误内核模式
%% Starts the error kernel: a process that traps exits and runs loop/1 with
%% an empty service list. The process is registered as `error_kernel'.
%% Returns the result of register/2.
start_error_kernel() ->
    Kernel = fun() ->
        process_flag(trap_exit, true),
        loop([])
    end,
    register(error_kernel, spawn(Kernel)).
%% Main loop of the error kernel. `Services' is a list of {Pid, Mod} pairs.
%%   {start_service, Mod} -> starts Mod:start/0 in a linked process and
%%                           records it.
%%   {'EXIT', Pid, Reason} -> if Pid is a known service, logs the crash and
%%                            replaces it with a fresh linked process;
%%                            otherwise the signal is ignored.
loop(Services) ->
    receive
        {start_service, Mod} ->
            loop([{spawn_link(Mod, start, []), Mod} | Services]);
        {'EXIT', Pid, Reason} ->
            Known = lists:keyfind(Pid, 1, Services),
            loop(restart_if_known(Known, Reason, Services))
    end.

%% Returns the updated service list after an exit signal.
restart_if_known(false, _Reason, Services) ->
    Services;
restart_if_known({Pid, Mod}, Reason, Services) ->
    error_logger:error_msg("服务~p(~p)崩溃,原因:~p~n", [Mod, Pid, Reason]),
    NewPid = spawn_link(Mod, start, []),
    lists:keyreplace(Pid, 1, Services, {NewPid, Mod}).
架构特点:
- 中央管理进程监控所有服务
- 动态服务注册与重启
- 错误隔离与服务恢复解耦
4.2 断路器模式实现
-module(circuit_breaker).
-export([start/2, call/2]).

%% How long the breaker stays open before it closes again (milliseconds).
-define(RESET_AFTER_MS, 5000).
%% How long call/2 waits for an answer from the breaker (milliseconds).
-define(CALL_TIMEOUT_MS, 5000).

%% Starts a circuit breaker process in front of ServiceMod:handle/1.
%% The breaker opens after `Threshold' consecutive {error, _} results.
%% Returns the pid of the breaker process.
start(ServiceMod, Threshold) ->
    spawn(fun() ->
        State = {closed, 0},
        monitor_service(ServiceMod, Threshold, State)
    end).

%% Sends `Msg' through the breaker and waits for the answer.
%% Returns whatever the breaker replies: the service result,
%% {error, service_unavailable} or {error, circuit_open}. Additionally returns
%% {error, {breaker_down, Reason}} if the breaker process dies, and
%% {error, timeout} if no answer arrives within ?CALL_TIMEOUT_MS.
call(Breaker, Msg) ->
    %% The monitor reference doubles as a unique tag for the reply, so
    %% unrelated messages in the caller's mailbox are never mistaken for it.
    Ref = erlang:monitor(process, Breaker),
    Breaker ! {request, {self(), Ref}, Msg},
    receive
        {Ref, Reply} ->
            erlang:demonitor(Ref, [flush]),
            Reply;
        {'DOWN', Ref, process, _, Reason} ->
            {error, {breaker_down, Reason}}
    after ?CALL_TIMEOUT_MS ->
        erlang:demonitor(Ref, [flush]),
        {error, timeout}
    end.

%% Breaker loop. The third argument is {closed | open, FailCount}.
%% Closed: requests are forwarded to Mod:handle/1 and consecutive errors are
%% counted; reaching Threshold opens the breaker and schedules a `reset'.
%% Open: requests are rejected immediately with {error, circuit_open} until
%% `reset' arrives.
monitor_service(Mod, Threshold, {closed, FailCount}) ->
    receive
        {request, From, Msg} ->
            case Mod:handle(Msg) of
                {error, _Reason} when FailCount + 1 >= Threshold ->
                    reply(From, {error, circuit_open}),
                    erlang:send_after(?RESET_AFTER_MS, self(), reset),
                    monitor_service(Mod, Threshold, {open, 0});
                {error, _Reason} ->
                    reply(From, {error, service_unavailable}),
                    monitor_service(Mod, Threshold, {closed, FailCount + 1});
                Result ->
                    reply(From, Result),
                    monitor_service(Mod, Threshold, {closed, 0})
            end;
        reset ->
            %% Drop a reset that arrives while closed; left in the mailbox it
            %% would close the breaker immediately the next time it opens.
            monitor_service(Mod, Threshold, {closed, FailCount})
    end;
monitor_service(Mod, Threshold, {open, _FailCount}) ->
    receive
        {request, From, _Msg} ->
            %% Fail fast while open. Leaving the request unmatched would park
            %% it in the mailbox and keep the caller waiting until the reset.
            reply(From, {error, circuit_open}),
            monitor_service(Mod, Threshold, {open, 0});
        reset ->
            monitor_service(Mod, Threshold, {closed, 0})
    end.

%% Delivers a result to the requester. Requests from call/2 carry
%% {Pid, Ref} and get a tagged {Ref, Result}; any other `From' gets the bare
%% result, as raw `{request, From, Msg}' senders expect.
reply({Pid, Tag}, Result) when is_pid(Pid), is_reference(Tag) ->
    Pid ! {Tag, Result};
reply(From, Result) ->
    From ! Result.
功能亮点:
- closed/open两种状态自动切换
- 错误计数阈值控制熔断
- 自动复位机制
- 服务降级处理
评论