1. Erlang错误处理哲学

在咖啡厅里看到服务员打翻杯子时,其他服务员会立即清理现场并继续服务。这种"快速失败,优雅恢复"的理念正是Erlang错误处理的核心。Erlang/OTP通过进程隔离、监督树和错误传播三大法宝,让系统像训练有素的服务团队般可靠。

2. 基础错误处理机制

2.1 进程链接(link)实战

%% Barista process.
%% Spawns a process linked to the caller and returns its pid. The process
%% waits for a single message: `brew_coffee` prints a progress line and
%% pauses for one second before the process ends normally; any other
%% message makes it exit with reason `unexpected_message`.
start_barista() ->
    spawn_link(fun barista_wait/0).

%% Takes the first message out of the mailbox and hands it to the handler.
barista_wait() ->
    receive
        Order -> barista_handle(Order)
    end.

%% Brews on `brew_coffee`; deliberately crashes on anything else.
barista_handle(brew_coffee) ->
    io:format("正在冲泡咖啡~n"),
    timer:sleep(1000);
barista_handle(_Unknown) ->
    exit(unexpected_message).

%% Cashier process.
%% Spawns a process (linked to the caller) that watches the barista
%% identified by BaristaPid and returns the cashier's pid.
%%
%% The cashier traps exits and links itself to the barista, then waits for
%% one message. If that message is the barista's exit notification, the
%% reason is printed and restart_barista/0 (defined elsewhere) is called.
%% Any other first message ends the cashier normally.
start_cashier(BaristaPid) ->
    spawn_link(fun() ->
        %% Trap exits before linking so the barista's exit signal arrives
        %% as a message instead of killing the cashier.
        process_flag(trap_exit, true),
        %% spawn_link/1 only links the cashier to the process that called
        %% start_cashier/1. Without this explicit link the 'EXIT' clause
        %% below could never match. If the barista is already dead, the
        %% link yields {'EXIT', BaristaPid, noproc}.
        link(BaristaPid),
        receive
            {'EXIT', BaristaPid, Reason} ->
                io:format("咖啡师~p异常退出,原因:~p~n", [BaristaPid, Reason]),
                restart_barista();
            _ -> ok
        end
    end).

注释说明:

  • spawn_link创建新进程,并在新进程与调用进程之间建立双向链接(若要与其他进程建立链接,需显式调用link/1)
  • process_flag(trap_exit, true)使进程能够捕获退出信号
  • exit/1主动触发进程终止
  • 通过接收EXIT消息实现错误通知

2.2 监控(monitor)示例

%% Starts a coffee machine process and blocks until it terminates.
%% When the 'DOWN' notification arrives, the exit reason is printed and
%% schedule_maintenance/0 (defined elsewhere) is called; its result is the
%% return value of this function.
start_coffee_machine_monitor() ->
    %% spawn_monitor/1 creates the process and the monitor atomically, so
    %% the machine is never running unobserved.
    {MachinePid, Ref} = spawn_monitor(fun coffee_machine/0),
    receive
        {'DOWN', Ref, process, MachinePid, Reason} ->
            io:format("咖啡机~p故障,原因:~p~n", [MachinePid, Reason]),
            schedule_maintenance()
    end.

%% Body of the coffee machine process.
%% Waits up to five seconds for a `brew` message. On `brew` a number from
%% 1 to 10 is drawn: 1 makes the process exit with `overheating`, anything
%% else returns `ok`. If no `brew` arrives in time, the process exits with
%% `idle_timeout`.
coffee_machine() ->
    receive
        brew -> check_temperature(rand:uniform(10))
    after 5000 ->
        exit(idle_timeout)
    end.

%% A roll of 1 simulates overheating; every other roll is a successful brew.
check_temperature(1) -> exit(overheating);
check_temperature(_Roll) -> ok.

技术亮点:

  • monitor创建单向监控关系
  • 支持监控非链接进程
  • DOWN消息包含详细的错误原因
  • 适用于临时性监控场景

3. OTP框架中的错误处理策略

3.1 Supervisor行为模式

-module(coffee_shop_sup).
-behaviour(supervisor).

-export([start_link/0]).
-export([init/1]).

%% Starts the supervisor and registers it locally under this module's name.
%% This module is also the callback module; init/1 receives [].
start_link() ->
    RegisteredName = {local, ?MODULE},
    InitArgs = [],
    supervisor:start_link(RegisteredName, ?MODULE, InitArgs).

%% Supervisor callback.
%% Returns a one_for_one strategy with a restart limit of 5 within 60
%% seconds, and two worker children: a permanent barista and a transient
%% cashier.
init([]) ->
    Children = [
        worker_spec(barista_worker, barista, permanent),
        worker_spec(cashier_worker, cashier, transient)
    ],
    {ok, {{one_for_one, 5, 60}, Children}}.

%% Builds a tuple-form child spec for a worker that is started through
%% Mod:start_link/0 and given 5000 ms to shut down.
worker_spec(Id, Mod, Restart) ->
    {Id, {Mod, start_link, []}, Restart, 5000, worker, [Mod]}.

参数解析:

  • one_for_one重启策略:只重启故障子进程
  • permanent表示必须始终存在的进程
  • transient进程仅在异常终止时重启
  • 5次/60秒的重启频率限制:60秒内重启超过5次时,监督者会终止所有子进程并自行退出

3.2 GenServer错误处理模板

-module(order_processor).
-behaviour(gen_server).

%% Handles a synchronous `{place_order, Items}` request.
%%
%% Items is checked with validate_order/1 (defined elsewhere):
%%   {ok, Validated} -> replies {ok, process_order(Validated)} and keeps
%%                      the server running with the same state.
%%   {error, Reason} -> replies {error, Reason} to the caller and then
%%                      stops the server with {invalid_order, Reason}.
handle_call({place_order, Items}, _From, State) ->
    case validate_order(Items) of
        {ok, Validated} ->
            {reply, {ok, process_order(Validated)}, State};
        {error, Reason} ->
            %% The four-element stop tuple sends the reply before shutting
            %% down. The three-element form sends no reply, so the caller
            %% of gen_server:call would crash instead of seeing the error.
            {stop, {invalid_order, Reason}, {error, Reason}, State}
    end.

%% gen_server callback invoked when the server is about to stop.
%% Always returns ok. Orderly stop reasons (normal, shutdown,
%% {shutdown, _}) are not errors, so only other reasons are reported
%% through error_logger as an abnormal termination.
terminate(normal, _State) ->
    ok;
terminate(shutdown, _State) ->
    ok;
terminate({shutdown, _Detail}, _State) ->
    ok;
terminate(Reason, _State) ->
    error_logger:error_msg("订单处理器异常终止,原因:~p~n", [Reason]),
    ok.

%% gen_server callback for hot code upgrades.
%% The state needs no conversion between versions, so it is handed back
%% unchanged; the old version and the extra data are ignored.
code_change(_FromVsn, CurrentState, _UpgradeData) ->
    {ok, CurrentState}.

关键点说明:

  • gen_server提供标准化的错误处理流程
  • stop返回值触发有序关闭
  • terminate回调用于清理资源
  • code_change支持热代码升级

4. 高级错误处理模式

4.1 错误内核模式

%% Starts the error kernel process and registers it as `error_kernel`.
%% Returns `true` (the result of register/2). The process is spawned
%% without a link to the caller.
start_error_kernel() ->
    register(error_kernel, spawn(fun error_kernel_init/0)).

%% Entry point of the kernel process: trap exits so crashes of linked
%% services arrive as messages, then enter the loop with no services.
error_kernel_init() ->
    process_flag(trap_exit, true),
    loop([]).

%% Main loop of the error kernel.
%% Services is a list of {Pid, Mod} pairs, one per running service.
%%   {start_service, Mod} -> spawns Mod:start() linked to this process
%%                           and records it.
%%   {'EXIT', Pid, Reason} -> if Pid is a recorded service, logs the
%%                           crash, spawns a replacement and swaps it
%%                           into the list; exits of unknown pids are
%%                           ignored.
loop(Services) ->
    receive
        {start_service, Mod} ->
            loop([{spawn_link(Mod, start, []), Mod} | Services]);
        {'EXIT', Pid, Reason} ->
            Entry = lists:keyfind(Pid, 1, Services),
            loop(handle_service_exit(Entry, Reason, Services))
    end.

%% Returns the service list to continue with after an exit notification.
handle_service_exit({DeadPid, Mod}, Reason, Services) ->
    error_logger:error_msg("服务~p(~p)崩溃,原因:~p~n", [Mod, DeadPid, Reason]),
    Replacement = spawn_link(Mod, start, []),
    lists:keyreplace(DeadPid, 1, Services, {Replacement, Mod});
handle_service_exit(false, _Reason, Services) ->
    Services.

架构特点:

  • 中央管理进程监控所有服务
  • 动态服务注册与重启
  • 错误隔离与服务恢复解耦

4.2 断路器模式实现

%% Circuit breaker that guards calls to ServiceMod:handle/1.
%%
%% The breaker is a process with two states:
%%   closed - requests are forwarded to the service; consecutive
%%            {error, _} results are counted.
%%   open   - requests are rejected at once with {error, circuit_open};
%%            after ?RESET_AFTER_MS the breaker closes again.
-module(circuit_breaker).
-export([start/2, call/2]).

%% How long the breaker stays open before it closes again (ms).
-define(RESET_AFTER_MS, 5000).
%% Longest time call/2 waits for an answer from the breaker (ms).
-define(CALL_TIMEOUT_MS, 5000).

%% Starts a breaker for ServiceMod and returns its pid.
%% The breaker opens on the Threshold-th consecutive {error, _} result.
start(ServiceMod, Threshold) ->
    spawn(fun() ->
        monitor_service(ServiceMod, Threshold, {closed, 0})
    end).

%% Sends Msg through the breaker and waits for the answer.
%% Returns whatever the breaker replies: the service result,
%% {error, service_unavailable} or {error, circuit_open}. In addition:
%%   {error, {breaker_down, Reason}} - the breaker process is not alive
%%   {error, timeout}                - no answer within ?CALL_TIMEOUT_MS
call(Breaker, Msg) ->
    %% The monitor reference doubles as a unique tag for the reply, so
    %% unrelated messages in the caller's mailbox are never mistaken for it.
    Ref = erlang:monitor(process, Breaker),
    Breaker ! {request, {self(), Ref}, Msg},
    receive
        {Ref, Result} ->
            erlang:demonitor(Ref, [flush]),
            Result;
        {'DOWN', Ref, process, _Pid, Reason} ->
            {error, {breaker_down, Reason}}
    after ?CALL_TIMEOUT_MS ->
        erlang:demonitor(Ref, [flush]),
        {error, timeout}
    end.

%% Breaker loop. The third argument is {State, FailCount}, where State is
%% closed | open and FailCount is the number of consecutive failures.
monitor_service(Mod, Threshold, {closed, FailCount}) ->
    receive
        {request, From, Msg} ->
            case Mod:handle(Msg) of
                {error, _Reason} when FailCount + 1 >= Threshold ->
                    reply(From, {error, circuit_open}),
                    erlang:send_after(?RESET_AFTER_MS, self(), reset),
                    monitor_service(Mod, Threshold, {open, 0});
                {error, _Reason} ->
                    reply(From, {error, service_unavailable}),
                    monitor_service(Mod, Threshold, {closed, FailCount + 1});
                Result ->
                    reply(From, Result),
                    monitor_service(Mod, Threshold, {closed, 0})
            end
    end;
monitor_service(Mod, Threshold, {open, _FailCount}) ->
    receive
        {request, From, _Msg} ->
            %% Fail fast while open. Leaving the request unanswered would
            %% block the caller and queue work for the moment of reset.
            reply(From, {error, circuit_open}),
            monitor_service(Mod, Threshold, {open, 0});
        reset ->
            monitor_service(Mod, Threshold, {closed, 0})
    end.

%% Delivers a result to the requester. call/2 identifies itself as
%% {Pid, Ref} and receives {Ref, Result}; a bare pid (the raw message
%% protocol) receives the result as-is.
reply({Pid, Ref}, Result) when is_pid(Pid), is_reference(Ref) ->
    Pid ! {Ref, Result};
reply(Pid, Result) ->
    Pid ! Result.

功能亮点:

  • closed/open两种状态自动切换
  • 错误计数阈值控制熔断
  • 自动复位机制
  • 服务降级处理