$$
\begin{aligned}
G_t &= R_{t+1}+R_{t+2}+R_{t+3}+\cdots+R_T \\
G_t &= R_{t+1}+\gamma R_{t+2}+\gamma^{2} R_{t+3}+\cdots = \sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1}, \quad 0 \le \gamma \le 1 \\
G_t &= R_{t+1}+\gamma R_{t+2}+\gamma^{2} R_{t+3}+\cdots \\
    &= R_{t+1}+\gamma \left(R_{t+2}+\gamma R_{t+3}+\cdots\right) \\
    &= R_{t+1}+\gamma G_{t+1}
\end{aligned}
$$
$$
\begin{aligned}
v_\pi(s) &= \mathbb{E}_\pi\left[G_t \mid S_t=s\right] = \mathbb{E}_\pi\left[\sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1} \,\middle|\, S_t=s\right], \quad \forall s \in S \\
v_\pi(s) &= \sum_{a} \pi(a \mid s)\, q_\pi(s,a) \\
q_\pi(s,a) &= \mathbb{E}_\pi\left[G_t \mid S_t=s, A_t=a\right] = \mathbb{E}_\pi\left[\sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1} \,\middle|\, S_t=s, A_t=a\right] \\
q_\pi(s,a) &= \sum_{s',r} p(s',r \mid s,a)\left[r+\gamma v_\pi(s')\right]
\end{aligned}
$$
$$
\begin{aligned}
G_t &= R_{t+1}+\gamma R_{t+2}+\gamma^{2} R_{t+3}+\cdots = R_{t+1}+\gamma G_{t+1} \\
v_\pi(s) &= \mathbb{E}_\pi\left[G_t \mid S_t=s\right] = \mathbb{E}_\pi\left[R_{t+1}+\gamma G_{t+1} \mid S_t=s\right] \\
&= \sum_{a} \pi(a \mid s) \sum_{s'} \sum_{r} p(s',r \mid s,a)\Bigl[r+\gamma\, \mathbb{E}_\pi\left[G_{t+1} \mid S_{t+1}=s'\right]\Bigr] \\
&= \sum_{a} \pi(a \mid s) \sum_{s',r} p(s',r \mid s,a)\left[r+\gamma v_\pi(s')\right] \quad \text{（此式为 $v_\pi$ 的贝尔曼方程）} \\
q_\pi(s,a) &= \mathbb{E}_\pi\left[G_t \mid S_t=s, A_t=a\right] = \mathbb{E}_\pi\left[R_{t+1}+\gamma G_{t+1} \mid S_t=s, A_t=a\right] \\
&= \sum_{s',r} p(s',r \mid s,a)\Bigl[r+\gamma \sum_{a'} \pi(a' \mid s')\, q_\pi(s',a')\Bigr] \quad \text{（此式为 $q_\pi$ 的贝尔曼方程）}
\end{aligned}
$$
$$
\begin{aligned}
v_*(s) &= \max_{\pi} v_\pi(s) \\
v_*(s) &= \max_{a \in A(s)} q_{\pi_*}(s,a) = \max_{a} \mathbb{E}_{\pi_*}\left[G_t \mid S_t=s, A_t=a\right] \\
&= \max_{a} \mathbb{E}_{\pi_*}\left[R_{t+1}+\gamma G_{t+1} \mid S_t=s, A_t=a\right] \\
&= \max_{a} \mathbb{E}\left[R_{t+1}+\gamma v_*(S_{t+1}) \mid S_t=s, A_t=a\right] \\
&= \max_{a} \sum_{s',r} p(s',r \mid s,a)\left[r+\gamma v_*(s')\right] \quad \text{（$v_*$ 的贝尔曼最优方程）} \\
q_*(s,a) &= \max_{\pi} q_\pi(s,a) \\
q_*(s,a) &= \mathbb{E}\left[R_{t+1}+\gamma \max_{a'} q_*(S_{t+1},a') \,\middle|\, S_t=s, A_t=a\right] \\
&= \sum_{s',r} p(s',r \mid s,a)\Bigl[r+\gamma \max_{a'} q_*(s',a')\Bigr] \quad \text{（$q_*$ 的贝尔曼最优方程）}
\end{aligned}
$$
答案解析请参考强化学习(第二版)第三章答案