# Theory Value_Iteration

(* Author: Maximilian Schäffeler *)

theory Value_Iteration
imports "MDP-Rewards.MDP_reward"
begin

context MDP_att_â
begin

section â¹Value Iterationâº
text â¹
In the previous sections we derived that repeated application of @{const "ââ©b"} to any bounded
function from states to the reals converges to the optimal value of the MDP @{const "Î½â©b_opt"}.

We can turn this procedure into an algorithm that computes not only an approximation of
@{const "Î½â©b_opt"} but also a policy that is arbitrarily close to optimal.

Most of the proofs rely on the assumption that the supremum in @{const "ââ©b"} can always be attained.
âº

text â¹
The following lemma shows that the relation we use to prove termination of the value iteration
algorithm decreases in each step.
In essence, the distance of the estimate to the optimal value decreases by a factor of at
least @{term l} per iteration.âº

(* Each application of the Bellman operator shrinks the distance to the
   optimum by a factor of at least l, so the ceiling of the log-scaled
   distance (the termination measure) strictly decreases.
   NOTE(review): the proof steps reference `assms`, but no `assumes` clause is
   visible here — the lemma statement appears truncated by extraction
   (presumably it assumes v is not yet optimal and l > 0). TODO confirm
   against the original theory file. *)
lemma vi_rel_dec:
shows "âlog (1 / l) (dist (ââ©b v) Î½â©b_opt) - câ < âlog (1 / l) (dist v Î½â©b_opt) - câ"
proof -
have "log (1 / l) (dist (ââ©b v) Î½â©b_opt) - c â¤ log (1 / l) (l * dist v Î½â©b_opt) - c"
by (auto simp: assms less_le intro: log_le)
also have "â¦ = log (1 / l) l + log (1/l) (dist v Î½â©b_opt) - c"
using assms disc_lt_one
by (auto simp: less_le intro!: log_mult)
also have "â¦ = -(log (1 / l) (1/l)) + (log (1/l) (dist v Î½â©b_opt)) - c"
using assms disc_lt_one
by (subst log_inverse[symmetric]) (auto simp: less_le right_inverse_eq)
also have "â¦ = (log (1/l) (dist v Î½â©b_opt)) - 1 - c"
using assms order.strict_implies_not_eq[OF disc_lt_one]
by (auto intro!: log_eq_one neq_le_trans)
finally have "log (1 / l) (dist (ââ©b v) Î½â©b_opt) - c â¤ log (1 / l) (dist v Î½â©b_opt) - 1 - c" .
thus ?thesis
by linarith
qed

(* NOTE(review): orphaned proof fragment — the lemma statement and the steps
   establishing the facts `le1` and `le2` it uses were lost during
   extraction; the fragment cannot be checked as it stands. TODO recover the
   missing statement from the original theory. *)
proof -
by metis
show ?thesis
using mult_right_mono[of l 1] disc_lt_one
by (fastforce intro!: order.trans[OF le2] order.trans[OF le1])
qed

(* Termination measure for value iteration: the (natural-number ceiling of
   the) gap between the log-distance of the current estimate v to the optimal
   value and the log of the stopping threshold eps*(1-l)/(8*l).
   Zero when the estimate is already optimal or the discount factor l is 0. *)
abbreviation "term_measure â¡ (Î»(eps, v).
if v = Î½â©b_opt â¨ l = 0
then 0
else nat (ceiling (log (1/l) (dist v Î½â©b_opt) - log (1/l) (eps * (1-l) / (8 * l)))))"

(* Value iteration: repeatedly apply the Bellman operator until successive
   estimates satisfy the stopping criterion 2*l*dist v (ℒ⇩b v) < eps*(1-l)
   (or eps ≤ 0, which forces immediate termination); return the final
   estimate ℒ⇩b v. Termination is proved below via term_measure. *)
function value_iteration :: "real â ('s ââ©b real) â ('s ââ©b real)" where
"value_iteration eps v =
(if 2 * l * dist v (ââ©b v) < eps * (1-l) â¨ eps â¤ 0 then ââ©b v else value_iteration eps (ââ©b v))"
by auto

(* Termination of value_iteration: each recursive call strictly decreases
   term_measure (by vi_rel_dec); the l = 0 case is discharged automatically.
   NOTE(review): the block around `dist_ge` is garbled — a stray
   `using h / by force` follows the first `by (auto ...)`, indicating a lost
   intermediate `have` statement. TODO reconstruct from the original. *)
termination
proof (relation "Wellfounded.measure term_measure", (simp; fail), cases "l = 0")
case False
fix eps v
assume h: "Â¬ (2 * l * dist v (ââ©b v) < eps * (1 - l) â¨ eps â¤ 0)"
show "((eps, ââ©b v), eps, v) â Wellfounded.measure term_measure"
proof -
have gt_zero[simp]: "l â  0" "eps > 0" and dist_ge: "eps * (1 - l) â¤ dist v (ââ©b v) * (2 * l)"
using h
by (auto simp: algebra_simps)
using h
by force
have "log (1 / l) (eps * (1 - l) / (8 * l)) < log (1 / l) (dist v Î½â©b_opt)"
proof (intro log_less)
show "1 < 1 / l"
by (auto intro!: mult_imp_less_div_pos intro: neq_le_trans)
show "0 < eps * (1 - l) / (8 * l)"
by (auto intro!: mult_imp_less_div_pos intro: neq_le_trans)
show "eps * (1 - l) / (8 * l) < dist v Î½â©b_opt"
using dist_pos_lt[OF v_not_opt] dist_ââ©b_lt_dist_opt[of v] gt_zero zero_le_disc
by (intro mult_imp_div_pos_less le_less_trans[OF dist_ge], argo+)
qed
thus ?thesis
using vi_rel_dec h
by auto
qed
qed auto

text â¹
The distance between an estimate for the value and the optimal value can be bounded with respect to
the distance between the estimate and the result of applying it to @{const ââ©b}
âº
lemma contraction_â_dist: "(1 - l) * dist v Î½â©b_opt â¤ dist v (ââ©b v)"
using contraction_dist contraction_â disc_lt_one zero_le_disc
by fastforce

(* NOTE(review): the `lemma` keyword and `shows` clause of this block were
   lost in extraction — only the assumptions and the proof body remain.
   Judging from the chain, the conclusion is presumably a bound like
   "2 * l * dist v ν⇩b_opt < eps"; the justification after the first `have`
   and the statement before the final `by auto` are also missing.
   TODO reconstruct from the original theory. *)
assumes "eps > 0" "2 * l * dist v (ââ©b v) < eps * (1-l)"
proof -
have "dist v Î½â©b_opt â¤ dist v (ââ©b v) / (1 - l)"
using contraction_â_dist
hence "2 * l * dist v Î½â©b_opt â¤ 2 * l * (dist v (ââ©b v) / (1 - l))"
using contraction_â_dist assms mult_le_cancel_left_pos[of "2 * l"]
by (fastforce intro!: mult_left_mono[of _ _ "2 * l"])
hence "2 * l * dist v Î½â©b_opt < eps"
by (auto simp: assms(2) pos_divide_less_eq intro: order.strict_trans1)
hence "dist v Î½â©b_opt * l < eps / 2"
by argo
hence "l * dist v Î½â©b_opt < eps / 2"
by (auto simp: algebra_simps)
by auto
qed

text â¹
The estimates above allow to give a bound on the error of @{const value_iteration}.
âº
(* Remove the unfolding equation of value_iteration from the simpset, so the
   inductions below do not unfold the recursion eagerly. *)
declare value_iteration.simps[simp del]

(* On termination, the result of value iteration is within eps/2 of the
   optimal value. *)
lemma value_iteration_error:
assumes "eps > 0"
shows "dist (value_iteration eps v) Î½â©b_opt < eps / 2"
by (induction eps v rule: value_iteration.induct) auto

text â¹
After the value iteration terminates, one can easily obtain a stationary deterministic
epsilon-optimal policy.

Such a policy does not exist in general, attainment of the supremum in @{const ââ©b} is required.
âº
(* find_policy v: in each state pick an action maximizing the one-step
   lookahead value L⇩a a v s, i.e. the greedy policy w.r.t. v. *)
definition "find_policy (v :: 's ââ©b real) s = arg_max_on (Î»a. Lâ©a a v s) (A s)"

(* vi_policy: the greedy policy w.r.t. the result of value iteration. *)
definition "vi_policy eps v = find_policy (value_iteration eps v)"

text â¹
We formalize the attainment of the supremum using a predicate @{const has_arg_max}.
âº

(* vi u n: the n-fold application of the Bellman operator to u. *)
abbreviation "vi u n â¡ (ââ©b ^^n) u"

(* NOTE(review): the `lemma` keyword of this monotonicity fact was lost in
   extraction; the statement is that vi is monotone in its first argument. *)
assumes "u â¤ v" shows "vi u n â¤ vi v n"
by (induction n) auto

(* If one Bellman step decreases the estimate, the decrease persists through
   all later iterates (shifted by any m).
   NOTE(review): the justifications of the first and last calculation steps
   (funpow_add-style rewriting) are missing from this extraction. *)
lemma
assumes "vi v (Suc n) â¤ vi v n"
shows "vi v (Suc n + m) â¤ vi v (n + m)"
proof -
have "vi v (Suc n + m) = vi (vi v (Suc n)) m"
also have "... â¤ vi (vi v n) m"
by auto
also have "... = vi v (n + m)"
finally show ?thesis .
qed

(* Dual of the previous lemma: if one Bellman step increases the estimate,
   the increase persists through all later iterates.
   NOTE(review): the justifications of both calculation steps are missing
   from this extraction. *)
lemma
assumes "vi v n â¤ vi v (Suc n)"
shows "vi v (n + m) â¤ vi v (Suc n + m)"
proof -
have "vi v (n + m) â¤ vi (vi v n) m"
also have "â¦ â¤ vi v (Suc n + m)"
finally show ?thesis .
qed

(* 6.3.1 *)
(* a) *)

(* The distance between successive value-iteration iterates tends to zero. *)
lemma "(Î»n. dist (vi v (Suc n)) (vi v n)) â¢ 0"
using thm_6_3_1_b_aux[of v]
by (auto simp only: dist_commute[of "((ââ©b ^^ Suc _) v)"])

end

context MDP_att_â
begin

text â¹
The error of the resulting policy is bounded by the distance from its value to the value computed
by the value iteration plus the error in the value iteration itself.
We show that both are less than @{term "eps / 2"} when the algorithm terminates.
âº
(* The greedy policy w.r.t. ℒ⇩b v is eps-optimal once the value-iteration
   stopping condition holds: the distance from its value ν⇩b ?p to ν⇩b_opt is
   below eps.
   NOTE(review): the `shows` clause, several `have` statements that open the
   calculation chains (before the dangling `using L_ν_fix` / `using
   dist_triangle` steps), and the `moreover` facts feeding the final
   `ultimately` were lost in extraction; the proof as shown is incomplete.
   TODO reconstruct from the original theory. *)
lemma find_policy_error_bound:
assumes "eps > 0" "2 * l * dist v (ââ©b v) < eps * (1-l)"
proof -
let ?d = "mk_dec_det (find_policy (ââ©b v))"
let ?p = "mk_stationary ?d"
(* shorter proof:     by (auto simp: arg_max_SUP[OF find_policy_QR_is_arg_max] ââ©b_split.rep_eq â_split_def )*)
have L_eq_ââ©b: "L (mk_dec_det (find_policy v)) v = ââ©b v" for v
unfolding find_policy_def
proof (intro antisym)
show "L (mk_dec_det (Î»s. arg_max_on (Î»a. Lâ©a a v s) (A s))) v â¤ ââ©b v"
using Sup_att has_arg_max_arg_max abs_L_le
unfolding ââ©b.rep_eq â_eq_SUP_det less_eq_bfun_def arg_max_on_def is_dec_det_def max_L_ex_def
by (auto intro!: cSUP_upper bounded_imp_bdd_above boundedI[of _ "râ©M + l * norm v"])
next
show "ââ©b v â¤ L (mk_dec_det (Î»s. arg_max_on (Î»a. Lâ©a a v s) (A s))) v"
using Sup_att ex_dec_det
by (auto intro!: cSUP_least app_arg_max_ge simp: L_eq_Lâ©a_det max_L_ex_def is_dec_det_def)
qed
using L_Î½_fix
by force
using dist_triangle
by blast
also have "â¦ â¤ l *  dist (Î½â©b ?p) (ââ©b v) + l * dist (ââ©b v) v"
using contraction_â contraction_L
by auto
hence "dist (Î½â©b ?p) (ââ©b v) * (1 - l) â¤ l * dist (ââ©b v) v"
by argo
hence  "2 * dist (Î½â©b ?p) (ââ©b v) * (1-l) â¤ 2 * (l * dist (ââ©b v) v)"
using zero_le_disc mult_left_mono
by auto
also have "â¦ â¤ eps * (1-l)"
using assms
by (auto intro!: mult_left_mono simp: dist_commute pos_divide_le_eq)
finally have "2 * dist (Î½â©b ?p) (ââ©b v) * (1 - l) â¤ eps * (1 - l)" .
using disc_lt_one mult_right_le_imp_le
by auto
by fastforce
using dist_triangle
by blast
ultimately show ?thesis
by auto
qed

(* The policy computed by vi_policy is eps-optimal.
   NOTE(review): the `shows` clause is missing from this extraction —
   presumably a bound of the form
   dist (ν⇩b (mk_stationary_det (vi_policy eps v))) ν⇩b_opt < eps.
   TODO confirm against the original theory. *)
lemma vi_policy_opt:
assumes "0 < eps"
unfolding vi_policy_def
using assms
proof (induction eps v rule: value_iteration.induct)
case (1 v)
then show ?case
using find_policy_error_bound
by (subst value_iteration.simps) auto
qed

(* (6.3.1 d) Once two successive iterates are within the stopping threshold,
   the newer iterate is within eps/2 of the optimal value.
   NOTE(review): no proof follows this statement in the extraction. *)
lemma lemma_6_3_1_d:
assumes "eps > 0"
assumes "2 * l * dist (vi v (Suc n)) (vi v n) < eps * (1-l)"
shows "dist (vi v (Suc n)) Î½â©b_opt < eps / 2"

end

context MDP_act begin

(* find_policy' v: in each state pick (via the tie-breaker arb_act) one of
   the optimal actions for v. *)
definition "find_policy' (v :: 's ââ©b real) s = arb_act (opt_acts v s)"

(* vi_policy': the policy obtained from the value-iteration result. *)
definition "vi_policy' eps v = find_policy' (value_iteration eps v)"

(* Analogue of find_policy_error_bound for find_policy' (arb_act selection):
   the resulting policy is eps-optimal under the stopping condition.
   NOTE(review): as in find_policy_error_bound, the `shows` clause, the
   `have` statements opening the calculation chains, and the `moreover` facts
   feeding the final `ultimately` were lost in extraction.
   TODO reconstruct from the original theory. *)
lemma find_policy'_error_bound:
assumes "eps > 0" "2 * l * dist v (ââ©b v) < eps * (1-l)"
proof -
let ?d = "mk_dec_det (find_policy' (ââ©b v))"
let ?p = "mk_stationary ?d"
have L_eq_ââ©b: "L (mk_dec_det (find_policy' v)) v = ââ©b v" for v
unfolding find_policy'_def
using L_Î½_fix
by force
using dist_triangle
by blast
also have "â¦ â¤ l *  dist (Î½â©b ?p) (ââ©b v) + l * dist (ââ©b v) v"
using contraction_â contraction_L
by auto
hence "dist (Î½â©b ?p) (ââ©b v) * (1 - l) â¤ l * dist (ââ©b v) v"
by argo
hence  "2 * dist (Î½â©b ?p) (ââ©b v) * (1-l) â¤ 2 * (l * dist (ââ©b v) v)"
using zero_le_disc mult_left_mono
by auto
also have "â¦ â¤ eps * (1-l)"
using assms
by (auto intro!: mult_left_mono simp: dist_commute pos_divide_le_eq)
finally have "2 * dist (Î½â©b ?p) (ââ©b v) * (1 - l) â¤ eps * (1 - l)".
using disc_lt_one mult_right_le_imp_le
by auto
by fastforce
using dist_triangle
by blast
ultimately show ?thesis
by auto
qed

(* The policy computed by vi_policy' is eps-optimal (for positive discount).
   NOTE(review): the `shows` clause is missing from this extraction;
   presumably the analogue of vi_policy_opt for find_policy'. *)
lemma vi_policy'_opt:
assumes "eps > 0" "l > 0"
unfolding vi_policy'_def
using assms
proof (induction eps v rule: value_iteration.induct)
case (1 v)
then show ?case
using find_policy'_error_bound
by (subst value_iteration.simps) auto
qed

end
end

# Theory Policy_Iteration

(* Author: Maximilian Schäffeler *)

theory Policy_Iteration
imports "MDP-Rewards.MDP_reward"

begin

section â¹Policy Iterationâº
text â¹
The Policy Iteration algorithms provides another way to find optimal policies under the expected
total reward criterion.
It differs from Value Iteration in that it continuously improves an initial guess for an optimal
decision rule. Its execution can be subdivided into two alternating steps: policy evaluation and
policy improvement.

Policy evaluation means the calculation of the value of the current decision rule.

During the improvement phase, we choose the decision rule with the maximum value for L,
while we prefer to keep the old action selection in case of ties.
âº

context MDP_att_â begin
definition "policy_eval d = Î½â©b (mk_stationary_det d)"
end

context MDP_act
begin

(* policy_improvement d v: keep the current action d s if it already
   maximizes the one-step lookahead L⇩a _ v s; otherwise switch to an
   arbitrary optimal action. Keeping ties stable is what makes the
   fixed-point test d = policy_step d meaningful. *)
definition "policy_improvement d v s = (
if is_arg_max (Î»a. Lâ©a a (apply_bfun v) s) (Î»a. a â A s) (d s)
then d s
else arb_act (opt_acts v s))"

(* One policy-iteration step: evaluate the current rule, then improve it. *)
definition "policy_step d = policy_improvement d (policy_eval d)"

(* todo: move check is_dec_det outside the recursion *)
(* Policy iteration: repeat policy_step until the decision rule is stable
   (or is not a valid decision rule). Termination is proved later, under
   finiteness of the state and action spaces. *)
function policy_iteration :: "('s â 'a) â ('s â 'a)" where
"policy_iteration d = (
let d' = policy_step d in
if d = d' â¨ Â¬is_dec_det d then d else policy_iteration d')"
by auto

text â¹
The policy iteration algorithm as stated above does require that the supremum in @{const ââ©b} is
always attained.
âº

text â¹
Each policy improvement returns a valid decision rule.
âº
(* The improved decision rule always selects actions from A s. *)
lemma is_dec_det_pi: "is_dec_det (policy_improvement d v)"
unfolding policy_improvement_def is_dec_det_def is_arg_max_def
by (auto simp: some_opt_acts_in_A)

(* NOTE(review): orphaned proof fragment — the statement of the lemma proved
   by the following three lines (presumably policy_improvement_is_dec_det,
   which is referenced below) was lost during extraction. *)
unfolding policy_improvement_def is_dec_det_def
using some_opt_acts_in_A
by auto

(* The improved decision rule is ν-improving for v: its one-step operator L
   attains ℒ⇩b v pointwise.
   NOTE(review): the proof uses `assms`, but no `assumes` clause is visible —
   the lemma statement appears truncated in this extraction. TODO confirm. *)
lemma policy_improvement_improving:
shows "Î½_improving v (mk_dec_det (policy_improvement d v))"
proof -
have "ââ©b v x = L (mk_dec_det (policy_improvement d v)) v x" for x
using is_opt_act_some
by (fastforce simp: thm_6_2_10_a_aux' L_eq_Lâ©a_det is_opt_act_def policy_improvement_def
arg_max_SUP[symmetric, of _ _ "(policy_improvement d v x)"] )
thus ?thesis
using policy_improvement_is_dec_det assms
by (auto simp: Î½_improving_alt)
qed

(* Applying L with the improved policy to the evaluated value coincides with
   the Bellman operator on that value.
   NOTE(review): the closing proof method after `using assms` is missing
   from this extraction. *)
lemma eval_policy_step_L:
assumes "is_dec_det d"
shows "L (mk_dec_det (policy_step d)) (policy_eval d) = ââ©b (policy_eval d)"
unfolding policy_step_def
using assms

text â¹ The sequence of policies generated by policy iteration has monotonically increasing
discounted reward.âº
(* One policy-iteration step never decreases the value of the policy.
   NOTE(review): the middle of this proof is garbled — several `have`
   statements are missing and their justifications (`unfolding ... by auto`,
   `by (simp add: ...)`, `using lemma_6_1_2_b ...`, `by (metis ...)`) appear
   detached. TODO reconstruct from the original theory. *)
lemma policy_eval_mon:
assumes "is_dec_det d"
shows "policy_eval d â¤ policy_eval (policy_step d)"
proof -
let ?d' = "mk_dec_det (policy_step d)"
let ?dp = "mk_stationary_det d"
let ?P = "ât. l ^ t *â©R ð«â©1 ?d' ^^ t"

have "L (mk_dec_det d) (policy_eval d) â¤ L ?d' (policy_eval d)"
using assms
hence "policy_eval d â¤ L ?d' (policy_eval d)"
using L_Î½_fix policy_eval_def
by auto
unfolding policy_eval_def L_def
by auto
by (simp add: blinfun.diff_left diff_le_eq scaleR_blinfun.rep_eq)
using lemma_6_1_2_b
by auto
by (metis (mono_tags, lifting))
thus ?thesis
unfolding policy_eval_def
by (auto simp: Î½_stationary)
qed

text â¹
If policy iteration terminates, i.e. @{term "d = policy_step d"}, then it does so with optimal value.
âº
(* A fixed point of policy_step evaluates to a fixed point of ℒ⇩b, and hence
   to the optimal value.
   NOTE(review): the `shows` clause is missing from this extraction —
   presumably ν⇩b (mk_stationary_det d) = ν⇩b_opt. TODO confirm. *)
lemma policy_step_eq_imp_opt:
assumes "is_dec_det d" "d = policy_step d"
proof -
have "policy_eval d = ââ©b (policy_eval d)"
unfolding policy_eval_def
using L_Î½_fix assms eval_policy_step_L[unfolded policy_eval_def]
by fastforce
thus ?thesis
unfolding policy_eval_def
using â_fix_imp_opt
by blast
qed

end

text â¹We prove termination of policy iteration only if both the state and action sets are finite.âº
(* Locale for the termination proof: an MDP with arb_act tie-breaking, plus
   finiteness of the state space and of every action set. *)
locale MDP_PI_finite = MDP_act A K r l arb_act
for
A and
K :: "'s ::countable Ã 'a ::countable â 's pmf" and r l arb_act +
assumes fin_states: "finite (UNIV :: 's set)" and fin_actions: "âs. finite (A s)"
begin

text â¹If the state and action sets are both finite,
then so is the set of deterministic decision rules @{const "Dâ©D"}âº
(* NOTE(review): the lemma statement for this proof (finiteness of D⇩D, per
   the preceding text block) was lost in extraction, as were the `moreover
   have` lines that the final `ultimately` depends on. TODO reconstruct. *)
proof -
let ?set = "{d. âx :: 's. (x â UNIV â¶ d x â (âs. A s)) â§ (x â UNIV â¶ d x = undefined)}"
have "finite (âs. A s)"
using fin_actions fin_states by blast
hence "finite ?set"
using fin_states
by (fastforce intro: finite_set_of_finite_funs)
unfolding is_dec_det_def
by auto
ultimately show ?thesis
using finite_subset
by auto
qed

(* The strict value-improvement relation on decision rules is finite, being a
   subset of the finite product of valid decision rules with themselves.
   NOTE(review): the statement is cut off mid-formula after
   "ν⇩b (mk_stationary_det u) >" in this extraction. TODO restore. *)
lemma finite_rel: "finite {(u, v). is_dec_det u â§ is_dec_det v â§ Î½â©b (mk_stationary_det u) >
proof-
have aux: "finite {(u, v). is_dec_det u â§ is_dec_det v}"
by auto
show ?thesis
by (auto intro: finite_subset[OF _ aux])
qed

text â¹
This auxiliary lemma shows that policy iteration terminates if no improvement to the value of
the policy could be made, as then the policy remains unchanged.
âº
(* If a policy-iteration step does not change the value, the decision rule
   itself is unchanged: the current action is already an arg max, so
   policy_improvement keeps it.
   NOTE(review): the line introducing the `intro!:` method near the end has
   lost its `by (...)` prefix and the facts it uses in this extraction. *)
lemma eval_eq_imp_policy_eq:
assumes "policy_eval d = policy_eval (policy_step d)" "is_dec_det d"
shows "d = policy_step d"
proof -
have "policy_eval d s = policy_eval (policy_step d) s" for s
using assms
by auto
have "policy_eval d = L (mk_dec_det d) (policy_eval (policy_step d))"
unfolding policy_eval_def
using L_Î½_fix
by (auto simp: assms(1)[symmetric, unfolded policy_eval_def])
hence "policy_eval d = ââ©b (policy_eval d)"
by (metis L_Î½_fix policy_eval_def assms eval_policy_step_L)
hence "L (mk_dec_det d) (policy_eval d) s = ââ©b (policy_eval d) s" for s
using â¹policy_eval d = L (mk_dec_det d) (policy_eval (policy_step d))âº assms(1) by auto
hence "is_arg_max (Î»a. Lâ©a a (Î½â©b (mk_stationary (mk_dec_det d))) s) (Î»a. a â A s) (d s)" for s
intro!: SUP_is_arg_max boundedI[of _ "râ©M + l * norm _"] bounded_imp_bdd_above)
thus ?thesis
unfolding policy_eval_def policy_step_def policy_improvement_def
by auto
qed

text â¹
We are now ready to prove termination in the context of finite state-action spaces.
Intuitively, the algorithm terminates as there are only finitely many decision rules,
and in each recursive call the value of the decision rule increases.
âº
(* Termination of policy_iteration: the improvement relation is finite
   (finite_rel) and acyclic (values strictly increase per step), hence
   well-founded.
   NOTE(review): this proof is garbled — the `proof (relation ...)` opening
   that the `next` belongs to is missing, one `hence` statement is cut off
   mid-formula, and a `by`/`using` pair is detached. TODO reconstruct. *)
termination policy_iteration
using finite_rel
by (auto intro!: finite_acyclic_wf acyclicI_order)
next
fix d x
assume h: "x = policy_step d" "Â¬ (d = x â¨ Â¬ is_dec_det d)"
have "is_dec_det d â¹ Î½â©b (mk_stationary_det d) â¤ Î½â©b (mk_stationary_det (policy_step d))"
using policy_eval_mon
hence "is_dec_det d â¹ d â  policy_step d â¹
using eval_eq_imp_policy_eq policy_eval_def
by (force intro!: order.not_eq_order_implies_strict)
using is_dec_det_pi policy_step_def h
by auto
qed

text â¹
The termination proof gives us access to the induction rule/simplification lemmas associated
with the @{const policy_iteration} definition.
Thus we can prove that the algorithm finds an optimal policy.
âº

(* The result of policy iteration is a valid decision rule. *)
lemma is_dec_det_pi': "d â Dâ©D â¹ is_dec_det (policy_iteration d)"
using is_dec_det_pi
by (induction d rule: policy_iteration.induct) (auto simp: Let_def policy_step_def)

(* Policy iteration returns a fixed point of policy_step. *)
lemma pi_pi[simp]: "d â Dâ©D â¹ policy_step (policy_iteration d) = policy_iteration d"
using is_dec_det_pi
by (induction d rule: policy_iteration.induct) (auto simp: policy_step_def Let_def)

(* Correctness: policy iteration yields an optimal policy.
   NOTE(review): the `shows` clause of this lemma is missing from the
   extraction. TODO restore from the original theory. *)
lemma policy_iteration_correct:
by (induction d rule: policy_iteration.induct)
(fastforce intro!: policy_step_eq_imp_opt is_dec_det_pi' simp del: policy_iteration.simps)
end

context MDP_finite_type begin
text â¹
The following proofs concern code generation, i.e. how to represent @{const ð«â©1} as a matrix.
âº

(* In the finite-type MDP, the supremum-attainment assumption of MDP_att_ℒ
   holds, so all results above apply. *)
sublocale MDP_att_â
by (auto simp: A_ne finite_is_arg_max MDP_att_â_def MDP_att_â_axioms_def max_L_ex_def
has_arg_max_def MDP_reward_axioms)

(* Matrix representations for code generation: fun_to_matrix turns a linear
   map on bounded functions into a matrix; Ek_mat, nu_inv_mat and nu_mat
   represent 𝒫⇩1 d, (id - l·𝒫⇩1 d) and its Neumann-series inverse. *)
definition "fun_to_matrix f = matrix (Î»v. (Ï j. f (vec_nth v) j))"
definition "Ek_mat d = fun_to_matrix (Î»v. ((ð«â©1 d) (Bfun v)))"
definition "nu_inv_mat d = fun_to_matrix ((Î»v. ((id_blinfun - l *â©R ð«â©1 d) (Bfun v))))"
definition "nu_mat d = fun_to_matrix (Î»v. ((âi. (l *â©R ð«â©1 d) ^^ i) (Bfun v)))"

(* Applying (id - l·𝒫⇩1 d) agrees with multiplication by its matrix
   representation nu_inv_mat d.
   NOTE(review): this block was partially collapsed onto one line by
   extraction, and the `simp:` list is cut off before its closing parenthesis
   and `by`-justification (before `thus ?thesis`). TODO restore. *)
lemma apply_nu_inv_mat:
"(id_blinfun - l *â©R ð«â©1 d) v = Bfun (Î»i. ((nu_inv_mat d) *v (vec_lambda v)) $i)" proof - have eq_onpI: "P x â¹ eq_onp P x x" for P x by(simp add: eq_onp_def) have "Real_Vector_Spaces.linear (Î»v. vec_lambda (((id_blinfun - l *â©R ð«â©1 d) (bfun.Bfun (($) v)))))"
by (auto simp del: real_scaleR_def intro: linearI
simp: scaleR_vec_def eq_onpI plus_vec_def vec_lambda_inverse plus_bfun.abs_eq[symmetric]
thus ?thesis
unfolding Ek_mat_def fun_to_matrix_def nu_inv_mat_def
by (auto simp: apply_bfun_inverse vec_lambda_inverse)
qed

(* vec_lambda is a bounded linear map from bounded functions over the finite
   state type to vectors, with norm bound CARD('s).
   NOTE(review): the `by`-justification of the final `show` (between the
   `unfolding` line and `qed`) is missing from this extraction. *)
lemma bounded_linear_vec_lambda: "bounded_linear (Î»x. vec_lambda (x :: 's ââ©b real))"
proof (intro bounded_linear_intro)
fix x :: "'s ââ©b real"
have "sqrt (â i â UNIV . (apply_bfun x i)â§2) â¤ (â i â UNIV . Â¦(apply_bfun x i)Â¦)"
using L2_set_le_sum_abs
unfolding L2_set_def
by auto
also have "(â i â UNIV . Â¦(apply_bfun x i)Â¦) â¤ (card (UNIV :: 's set) * (â¨xa. Â¦apply_bfun x xaÂ¦))"
by (auto intro!: cSup_upper sum_bounded_above)
finally show "norm (vec_lambda (apply_bfun x)) â¤ norm x * CARD('s)"
unfolding norm_vec_def norm_bfun_def dist_bfun_def L2_set_def
qed (auto simp: plus_vec_def scaleR_vec_def)

lemma bounded_linear_vec_lambda_blinfun:
shows "bounded_linear (Î»v. vec_lambda (apply_bfun (blinfun_apply f (bfun.Bfun (($) v)))))" using blinfun.bounded_linear_right by (fastforce intro: bounded_linear_compose[OF bounded_linear_vec_lambda] bounded_linear_bfun_nth bounded_linear_compose[of f]) lemma invertible_nu_inv_max: "invertible (nu_inv_mat d)" unfolding nu_inv_mat_def fun_to_matrix_def by (auto simp: matrix_invertible inv_norm_le' vec_lambda_inverse apply_bfun_inverse bounded_linear.linear[OF bounded_linear_vec_lambda_blinfun] intro!: exI[of _ "Î»v. (Ï j. (Î»v. (âi. (l *â©R ð«â©1 d) ^^ i) (Bfun v)) (vec_nth v) j)"]) end definition "least_arg_max f P = (LEAST x. is_arg_max f P x)" locale MDP_ord = MDP_finite_type A K r l for A and K :: "'s :: {finite, wellorder} Ã 'a :: {finite, wellorder} â 's pmf" and r l begin lemma â_fin_eq_det: "â v s = (â¨a â A s. Lâ©a a v s)" by (simp add: SUP_step_det_eq â_eq_SUP_det) lemma ââ©b_fin_eq_det: "ââ©b v s = (â¨a â A s. Lâ©a a v s)" by (simp add: SUP_step_det_eq ââ©b.rep_eq â_eq_SUP_det) sublocale MDP_PI_finite A K r l "Î»X. Least (Î»x. x â X)" by unfold_locales (auto intro: LeastI) end end # Theory Modified_Policy_Iteration (* Author: Maximilian SchÃ¤ffeler *) theory Modified_Policy_Iteration imports Policy_Iteration Value_Iteration begin section â¹Modified Policy Iterationâº locale MDP_MPI = MDP_finite_type A K r l + MDP_act A K r l arb_act for A and K :: "'s :: finite Ã 'a :: finite â 's pmf" and r l arb_act begin subsection â¹The Advantage Function @{term B}âº definition "B v s = (â¨d â Dâ©R. (r_dec d s + (l *â©R ð«â©1 d - id_blinfun) v s))" text "The function @{const B} denotes the advantage of choosing the optimal action vs. the current value estimate" lemma B_eq_â: "B v s = â v s - v s" proof - have *: "B v s = (â¨d â Dâ©R. L d v s - v s)" unfolding B_def L_def by (auto simp add: blinfun.bilinear_simps add_diff_eq) show ?thesis unfolding * proof (rule antisym) show "(â¨dâDâ©R. 
L d v s - v s) â¤ â v s - v s" unfolding â_def using ex_dec by (fastforce intro!: cSUP_upper cSUP_least) next have "bdd_above ((Î»d. L d v s - v s)  Dâ©R)" by (auto intro!: bounded_const bounded_minus_comp bounded_imp_bdd_above) thus "â v s - v s â¤ (â¨d â Dâ©R. L d v s - v s)" unfolding â_def diff_le_eq by (intro cSUP_least) (auto intro: cSUP_upper2 simp: diff_le_eq[symmetric]) qed qed text â¹@{const B} is a bounded function.âº lift_definition Bâ©b :: "('s ââ©b real) â 's ââ©b real" is "B" using ââ©b.rep_eq[symmetric] B_eq_â by (auto intro!: bfun_normI order.trans[OF abs_triangle_ineq4] add_mono abs_le_norm_bfun) lemma Bâ©b_eq_ââ©b: "Bâ©b v = ââ©b v - v" by (auto simp: ââ©b.rep_eq Bâ©b.rep_eq B_eq_â) lemma ââ©b_eq_SUP_Lâ©a: "ââ©b v s = (â¨a â A s. Lâ©a a v s)" using L_eq_Lâ©a_det ââ©b_eq_SUP_det SUP_step_det_eq by auto subsection â¹Optimization of the Value Function over Multiple Stepsâº definition "U m v s = (â¨d â Dâ©R. (Î½â©b_fin (mk_stationary d) m + ((l *â©R ð«â©1 d)^^m) v) s)" text â¹@{const U} expresses the value estimate obtained by optimizing the first @{term m} steps and afterwards using the current estimate.âº lemma U_zero [simp]: "U 0 v = v" unfolding U_def â_def by (auto simp: Î½â©b_fin.rep_eq) lemma U_one_eq_â: "U 1 v s = â v s" unfolding U_def â_def by (auto simp: Î½â©b_fin_eq_ð«â©X L_def blinfun.bilinear_simps) lift_definition Uâ©b :: "nat â ('s ââ©b real) â ('s ââ©b real)" is U proof - fix n v have "norm (Î½â©b_fin (mk_stationary d) m) â¤ (âi<m. 
l ^ i * râ©M)" for d m using abs_Î½_fin_le Î½â©b_fin.rep_eq by (auto intro!: norm_bound) moreover have "norm (((l *â©R ð«â©1 d)^^m) v) â¤ l ^ m * norm v" for d m by (auto simp: ð«â©X_const[symmetric] blinfun.bilinear_simps blincomp_scaleR_right simp del: ð«â©X_sconst intro!: boundedI order.trans[OF abs_le_norm_bfun] mult_left_mono) ultimately have *: "norm (Î½â©b_fin (mk_stationary d) m + ((l *â©R ð«â©1 d)^^m) v) â¤ (âi<m. l ^ i * râ©M) + l ^ m * norm v" for d m using norm_triangle_mono by blast show "U n v â bfun" using ex_dec order.trans[OF abs_le_norm_bfun *] by (fastforce simp: U_def intro!: bfun_normI cSup_abs_le) qed lemma Uâ©b_contraction: "dist (Uâ©b m v) (Uâ©b m u) â¤ l ^ m * dist v u" proof - have aux: "dist (Uâ©b m v s) (Uâ©b m u s) â¤ l ^ m * dist v u" if le: "Uâ©b m u s â¤ Uâ©b m v s" for s v u proof - let ?U = "Î»m v d. (Î½â©b_fin (mk_stationary d) m + ((l *â©R ð«â©1 d) ^^ m) v) s" have "Uâ©b m v s - Uâ©b m u s â¤ (â¨d â Dâ©R. ?U m v d - ?U m u d)" using bounded_stationary_Î½â©b_fin bounded_disc_ð«â©1 le unfolding Uâ©b.rep_eq U_def by (intro le_SUP_diff') (auto intro: bounded_plus_comp) also have "â¦ = (â¨d â Dâ©R. ((l *â©R ð«â©1 d) ^^ m) (v - u) s)" by (simp add: L_def scale_right_diff_distrib blinfun.bilinear_simps) also have "â¦ = (â¨d â Dâ©R. l^m * ((ð«â©1 d ^^ m) (v - u) s))" by (simp add: blincomp_scaleR_right blinfun.scaleR_left) also have "â¦ = l^m * (â¨d â Dâ©R. ((ð«â©1 d ^^ m) (v - u) s))" using Dâ©R_ne bounded_P bounded_disc_ð«â©1' by (auto intro: bounded_SUP_mul) also have "â¦ â¤ l^m * norm (â¨d â Dâ©R. ((ð«â©1 d ^^ m) (v - u) s))" by (simp add: mult_left_mono) also have "â¦ â¤ l^m * (â¨d â Dâ©R. norm (((ð«â©1 d ^^ m) (v - u) s)))" using Dâ©R_ne ex_dec bounded_norm_comp bounded_disc_ð«â©1' by (fastforce intro!: mult_left_mono) also have "â¦ â¤ l^m * (â¨d â Dâ©R. 
norm ((ð«â©1 d ^^ m) ((v - u))))" using ex_dec by (fastforce intro!: order.trans[OF norm_blinfun] abs_le_norm_bfun mult_left_mono cSUP_mono) also have "â¦ â¤ l^m * (â¨d â Dâ©R. norm ((v - u)))" using norm_ð«â©X_apply by (auto simp: ð«â©X_const[symmetric] cSUP_least mult_left_mono) also have "â¦ = l ^m * dist v u" by (auto simp: dist_norm) finally have "Uâ©b m v s - Uâ©b m u s â¤ l^m * dist v u" . thus ?thesis by (simp add: dist_real_def le) qed moreover have "Uâ©b m v s â¤ Uâ©b m u s â¹ dist (Uâ©b m v s) (Uâ©b m u s) â¤ l^m * dist v u" for u v s by (simp add: aux dist_commute) ultimately have "dist (Uâ©b m v s) (Uâ©b m u s) â¤ l^m * dist v u" for u v s using linear by blast thus "dist (Uâ©b m v) (Uâ©b m u) â¤ l^m * dist v u" by (simp add: dist_bound) qed lemma Uâ©b_conv: "â!v. Uâ©b (Suc m) v = v" "(Î»n. (Uâ©b (Suc m) ^^ n) v) â¢ (THE v. Uâ©b (Suc m) v = v)" proof - have *: "is_contraction (Uâ©b (Suc m))" unfolding is_contraction_def using Uâ©b_contraction[of "Suc m"] le_neq_trans[OF zero_le_disc] by (cases "l = 0") (auto intro!: power_Suc_less_one intro: exI[of _ "l^(Suc m)"]) show "â!v. Uâ©b (Suc m) v = v" "(Î»n. (Uâ©b (Suc m) ^^ n) v) â¢ (THE v. Uâ©b (Suc m) v = v)" using banach'[OF *] by auto qed lemma Uâ©b_convergent: "convergent (Î»n. 
(Uâ©b (Suc m) ^^ n) v)" by (intro convergentI[OF Uâ©b_conv(2)]) lemma Uâ©b_mono: assumes "v â¤ u" shows "Uâ©b m v â¤ Uâ©b m u" proof - have "Uâ©b m v s â¤ Uâ©b m u s" for s unfolding Uâ©b.rep_eq U_def proof (intro cSUP_mono, goal_cases) case 2 thus ?case by (simp add: bounded_imp_bdd_above bounded_disc_ð«â©1 bounded_plus_comp bounded_stationary_Î½â©b_fin) next case (3 n) thus ?case using less_eq_bfunD[OF ð«â©X_mono[OF assms]] by (auto simp: ð«â©X_const[symmetric] blincomp_scaleR_right blinfun.scaleR_left intro!: mult_left_mono exI) qed auto thus ?thesis using assms by auto qed lemma Uâ©b_le_ââ©b: "Uâ©b m v â¤ (ââ©b ^^ m) v" proof - have "Uâ©b m v s = (â¨d â Dâ©R. (L d^^ m) v s)" for m v s by (auto simp: L_iter Uâ©b.rep_eq ââ©b.rep_eq U_def â_def) thus ?thesis using L_iter_le_ââ©b ex_dec by (fastforce intro!: cSUP_least) qed lemma L_iter_le_Uâ©b: assumes "d â Dâ©R" shows "(L d^^m) v â¤ Uâ©b m v" using assms by (fastforce intro!: cSUP_upper bounded_imp_bdd_above simp: L_iter Uâ©b.rep_eq U_def bounded_disc_ð«â©1 bounded_plus_comp bounded_stationary_Î½â©b_fin) lemma lim_Uâ©b: "lim (Î»n. (Uâ©b (Suc m) ^^ n) v) = Î½â©b_opt" proof - have le_U: "Î½â©b_opt â¤ Uâ©b m Î½â©b_opt" for m proof - obtain d where d: "Î½_improving Î½â©b_opt (mk_dec_det d)" "d â Dâ©D" using ex_improving_det by auto have "Î½â©b_opt = (L (mk_dec_det d) ^^ m) Î½â©b_opt" by (induction m) (metis L_Î½_fix_iff ââ©b_opt Î½_improving_imp_ââ©b d(1) funpow_swap1)+ thus ?thesis using â¹d â Dâ©Dâº by (auto intro!: order.trans[OF _ L_iter_le_Uâ©b]) qed have "Uâ©b m Î½â©b_opt â¤ Î½â©b_opt" for m using â_inc_le_opt by (auto intro!: order.trans[OF Uâ©b_le_ââ©b] simp: funpow_swap1) hence "Uâ©b (Suc m) Î½â©b_opt = Î½â©b_opt" using le_U by (simp add: antisym) moreover have "(lim (Î»n. (Uâ©b (Suc m) ^^n) v)) = Uâ©b (Suc m) (lim (Î»n. 
(Uâ©b (Suc m) ^^n) v))" using limI[OF Uâ©b_conv(2)] theI'[OF Uâ©b_conv(1)] by auto ultimately show ?thesis using Uâ©b_conv(1) by metis qed lemma Uâ©b_tendsto: "(Î»n. (Uâ©b (Suc m) ^^ n) v) â¢ Î½â©b_opt" using lim_Uâ©b Uâ©b_convergent convergent_LIMSEQ_iff by metis lemma Uâ©b_fix_unique: "Uâ©b (Suc m) v = v â· v = Î½â©b_opt" using theI'[OF Uâ©b_conv(1)] Uâ©b_conv(1) by (auto simp: LIMSEQ_unique[OF Uâ©b_tendsto Uâ©b_conv(2)[of m]]) lemma dist_Uâ©b_opt: "dist (Uâ©b m v) Î½â©b_opt â¤ l^m * dist v Î½â©b_opt" proof - have "dist (Uâ©b m v) Î½â©b_opt = dist (Uâ©b m v) (Uâ©b m Î½â©b_opt)" by (metis Uâ©b.abs_eq Uâ©b_fix_unique U_zero apply_bfun_inverse not0_implies_Suc) also have "â¦ â¤ l^m * dist v Î½â©b_opt" by (meson Uâ©b_contraction) finally show ?thesis . qed subsection â¹Expressing a Single Step of Modified Policy Iterationâº text â¹The function @{term W} equals the value computed by the Modified Policy Iteration Algorithm in a single iteration. The right hand addend in the definition describes the advantage of using the optimal action for the first m steps. âº definition "W d m v = v + (âi < m. (l *â©R ð«â©1 d)^^i) (Bâ©b v)" lemma W_eq_L_iter: assumes "Î½_improving v d" shows "W d m v = (L d^^m) v" proof - have "(âi<m. (l *â©R ð«â©1 d)^^i) (ââ©b v) = (âi<m. (l *â©R ð«â©1 d)^^i) (L d v)" using Î½_improving_imp_ââ©b assms by auto hence "W d m v = v + ((âi<m. (l *â©R ð«â©1 d)^^i) (L d v)) - (âi<m. (l *â©R ð«â©1 d)^^i) v" by (auto simp: W_def Bâ©b_eq_ââ©b blinfun.bilinear_simps algebra_simps ) also have "â¦ = v + Î½â©b_fin (mk_stationary d) m + (âi<m. ((l *â©R ð«â©1 d)^^i) ((l *â©R ð«â©1 d) v)) - (âi<m. (l *â©R ð«â©1 d)^^i) v" unfolding L_def by (auto simp: Î½â©b_fin_eq blinfun.bilinear_simps blinfun.sum_left scaleR_right.sum) also have "â¦ = v + Î½â©b_fin (mk_stationary d) m + (âi<m. ((l *â©R ð«â©1 d)^^Suc i) v) - (âi<m. 
(l *â©R ð«â©1 d)^^i) v" by (auto simp del: blinfunpow.simps simp: blinfunpow_assoc) also have "â¦ = Î½â©b_fin (mk_stationary d) m + (âi<Suc m. ((l *â©R ð«â©1 d)^^ i) v) - (âi<m. (l *â©R ð«â©1 d)^^ i) v" by (subst sum.lessThan_Suc_shift) auto also have "â¦ = Î½â©b_fin (mk_stationary d) m + ((l *â©R ð«â©1 d)^^m) v" by (simp add: blinfun.sum_left) also have "â¦ = (L d ^^ m) v" using L_iter by auto finally show ?thesis . qed lemma W_le_Uâ©b: assumes "v â¤ u" "Î½_improving v d" shows "W d m v â¤ Uâ©b m u" proof - have "Uâ©b m u - W d m v â¥ Î½â©b_fin (mk_stationary d) m + ((l *â©R ð«â©1 d) ^^ m) u - (Î½â©b_fin (mk_stationary d) m + ((l *â©R ð«â©1 d)^^m) v)" using Î½_improving_D_MR assms(2) bounded_stationary_Î½â©b_fin bounded_disc_ð«â©1 by (fastforce intro!: diff_mono bounded_imp_bdd_above cSUP_upper bounded_plus_comp simp: Uâ©b.rep_eq U_def L_iter W_eq_L_iter) hence *: "Uâ©b m u - W d m v â¥ ((l *â©R ð«â©1 d) ^^ m) (u - v)" by (auto simp: blinfun.diff_right) show "W d m v â¤ Uâ©b m u" using order.trans[OF ð«â©1_n_disc_pos[unfolded blincomp_scaleR_right[symmetric]] *] assms by auto qed lemma W_ge_ââ©b: assumes "v â¤ u" "0 â¤ Bâ©b u" "Î½_improving u d'" shows "ââ©b v â¤ W d' (Suc m) u" proof - have "ââ©b v â¤ u + Bâ©b u" using assms(1) ââ©b_mono Bâ©b_eq_ââ©b by auto also have "â¦ â¤ W d' (Suc m) u" using L_mono Î½_improving_imp_ââ©b assms(3) assms by (induction m) (auto simp: W_eq_L_iter Bâ©b_eq_ââ©b) finally show ?thesis . 
qed lemma Bâ©b_le: assumes "Î½_improving v d" shows "Bâ©b v + (l *â©R ð«â©1 d - id_blinfun) (u - v) â¤ Bâ©b u" proof - have "r_decâ©b d + (l *â©R ð«â©1 d - id_blinfun) u â¤ Bâ©b u" using L_def L_le_ââ©b assms by (auto simp: Bâ©b_eq_ââ©b ââ©b.rep_eq â_def blinfun.bilinear_simps) moreover have "Bâ©b v = r_decâ©b d + (l *â©R ð«â©1 d - id_blinfun) v" using assms by (auto simp: Bâ©b_eq_ââ©b Î½_improving_imp_ââ©b[of _ d] L_def blinfun.bilinear_simps) ultimately show ?thesis by (simp add: blinfun.diff_right) qed lemma ââ©b_W_ge: assumes "u â¤ ââ©b u" "Î½_improving u d" shows "W d m u â¤ ââ©b (W d m u)" proof - have "0 â¤ ((l *â©R ð«â©1 d) ^^ m) (Bâ©b u)" by (metis Bâ©b_eq_ââ©b ð«â©1_n_disc_pos assms(1) blincomp_scaleR_right diff_ge_0_iff_ge) also have "â¦ = ((l *â©R ð«â©1 d)^^0 + (âi < m. (l *â©R ð«â©1 d)^^(Suc i))) (Bâ©b u) - (âi < m. (l *â©R ð«â©1 d)^^ i) (Bâ©b u)" by (subst sum.lessThan_Suc_shift[symmetric]) (auto simp: blinfun.diff_left[symmetric]) also have "â¦ = Bâ©b u + ((l *â©R ð«â©1 d - id_blinfun) oâ©L (âi < m. (l *â©R ð«â©1 d)^^i)) (Bâ©b u)" by (auto simp: blinfun.bilinear_simps sum_subtractf) also have "â¦ = Bâ©b u + (l *â©R ð«â©1 d - id_blinfun) (W d m u - u)" by (auto simp: W_def sum.lessThan_Suc[unfolded lessThan_Suc_atMost]) also have "â¦ â¤ Bâ©b (W d m u)" using Bâ©b_le assms(2) by blast finally have "0 â¤ Bâ©b (W d m u)" . thus ?thesis using Bâ©b_eq_ââ©b by auto qed subsection â¹Computing the Bellman Operator over Multiple Stepsâº definition L_pow :: "('s ââ©b real) â ('s â 'a) â nat â ('s ââ©b real)" where "L_pow v d m = (L (mk_dec_det d) ^^ Suc m) v" lemma sum_telescope': "(âiâ¤k. f (Suc i) - f i ) = f (Suc k) - (f 0 :: 'c :: ab_group_add)" using sum_telescope[of "-f" k] by auto (* eq 6.5.7 *) lemma L_pow_eq: assumes "Î½_improving v (mk_dec_det d)" shows "L_pow v d m = v + (âi â¤ m. 
((l *â©R ð«â©1 (mk_dec_det d))^^i)) (Bâ©b v)" proof - let ?d = "(mk_dec_det d)" have "(âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (Bâ©b v) = (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (L ?d v) - (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) v" using assms by (auto simp: Bâ©b_eq_ââ©b blinfun.bilinear_simps Î½_improving_imp_ââ©b) also have "â¦ = (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (r_decâ©b ?d) + (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) ((l *â©R ð«â©1 ?d) v) - (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) v" by (simp add: L_def blinfun.bilinear_simps) also have "â¦ = (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (r_decâ©b ?d) + (âi â¤ m. ((l *â©R ð«â©1 ?d)^^Suc i)) v - (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) v" by (auto simp: blinfun.sum_left blinfunpow_assoc simp del: blinfunpow.simps) also have "â¦ = (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (r_decâ©b ?d) + (âi â¤ m. ((l *â©R ð«â©1 ?d)^^Suc i) - (l *â©R ð«â©1 ?d)^^i) v" by (simp add: blinfun.diff_left sum_subtractf) also have "â¦ = (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (r_decâ©b ?d) + ((l *â©R ð«â©1 ?d)^^Suc m) v - v" by (subst sum_telescope') (auto simp: blinfun.bilinear_simps) finally have "(âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (Bâ©b v) = (âi â¤ m. ((l *â©R ð«â©1 ?d)^^i)) (r_decâ©b ?d) + ((l *â©R ð«â©1 ?d)^^Suc m) v - v" . 
moreover have "L_pow v d m = Î½â©b_fin (mk_stationary_det d) (Suc m) + ((l *â©R ð«â©1 ?d)^^Suc m) v" by (simp only: L_pow_def L_iter lessThan_Suc_atMost[symmetric]) ultimately show ?thesis by (auto simp: Î½â©b_fin_eq lessThan_Suc_atMost) qed lemma L_pow_eq_W: assumes "d â Dâ©D" shows "L_pow v (policy_improvement d v) m = W (mk_dec_det (policy_improvement d v)) (Suc m) v" using assms policy_improvement_improving by (auto simp: W_eq_L_iter L_pow_def) lemma L_pow_ââ©b_mono_inv: assumes "d â Dâ©D" "v â¤ ââ©b v" shows "L_pow v (policy_improvement d v) m â¤ ââ©b (L_pow v (policy_improvement d v) m)" using assms L_pow_eq_W ââ©b_W_ge policy_improvement_improving by auto subsection â¹The Modified Policy Iteration Algorithmâº context fixes d0 :: "'s â 'a" fixes v0 :: "'s ââ©b real" fixes m :: "nat â ('s ââ©b real) â nat" assumes d0: "d0 â Dâ©D" begin text â¹We first define a function that executes the algorithm for n steps.âº fun mpi :: "nat â (('s â 'a) Ã ('s ââ©b real))" where "mpi 0 = (policy_improvement d0 v0, v0)" | "mpi (Suc n) = (let (d, v) = mpi n; v' = L_pow v d (m n v) in (policy_improvement d v', v'))" definition "mpi_val n = snd (mpi n)" definition "mpi_pol n = fst (mpi n)" lemma mpi_pol_zero[simp]: "mpi_pol 0 = policy_improvement d0 v0" unfolding mpi_pol_def by auto lemma mpi_pol_Suc: "mpi_pol (Suc n) = policy_improvement (mpi_pol n) (mpi_val (Suc n))" by (auto simp: case_prod_beta' Let_def mpi_pol_def mpi_val_def) lemma mpi_pol_is_dec_det: "mpi_pol n â Dâ©D" unfolding mpi_pol_def using policy_improvement_is_dec_det d0 by (induction n) (auto simp: Let_def split: prod.splits) lemma Î½_improving_mpi_pol: "Î½_improving (mpi_val n) (mk_dec_det (mpi_pol n))" using d0 policy_improvement_improving mpi_pol_is_dec_det mpi_pol_Suc by (cases n) (auto simp: mpi_pol_def mpi_val_def) lemma mpi_val_zero[simp]: "mpi_val 0 = v0" unfolding mpi_val_def by auto lemma mpi_val_Suc: "mpi_val (Suc n) = L_pow (mpi_val n) (mpi_pol 
n) (m n (mpi_val n))" unfolding mpi_val_def mpi_pol_def by (auto simp: case_prod_beta' Let_def) lemma mpi_val_eq: "mpi_val (Suc n) = mpi_val n + (âi â¤ m n (mpi_val n). (l *â©R ð«â©1 (mk_dec_det (mpi_pol n))) ^^ i) (Bâ©b (mpi_val n))" using L_pow_eq[OF Î½_improving_mpi_pol] mpi_val_Suc by auto text â¹Value Iteration is a special case of MPI where @{term "ân v. m n v = 0"}.âº lemma mpi_includes_value_it: assumes "ân v. m n v = 0" shows "mpi_val (Suc n) = ââ©b (mpi_val n)" using assms Bâ©b_eq_ââ©b mpi_val_eq by auto subsection â¹Convergence Proofâº text â¹We define the sequence @{term w} as an upper bound for the values of MPI.âº fun w where "w 0 = v0" | "w (Suc n) = Uâ©b (Suc (m n (mpi_val n))) (w n)" lemma dist_Î½â©b_opt: "dist (w (Suc n)) Î½â©b_opt â¤ l * dist (w n) Î½â©b_opt" by (fastforce simp: algebra_simps intro: order.trans[OF dist_Uâ©b_opt] mult_left_mono power_le_one mult_left_le_one_le order.strict_implies_order) lemma dist_Î½â©b_opt_n: "dist (w n) Î½â©b_opt â¤ l^n * dist v0 Î½â©b_opt" by (induction n) (fastforce simp: algebra_simps intro: order.trans[OF dist_Î½â©b_opt] mult_left_mono)+ lemma w_conv: "w â¢ Î½â©b_opt" proof - have "(Î»n. l^n * dist v0 Î½â©b_opt) â¢ 0" using LIMSEQ_realpow_zero by (cases "v0 = Î½â©b_opt") auto then show ?thesis by (fastforce intro: metric_LIMSEQ_I order.strict_trans1[OF dist_Î½â©b_opt_n] simp: LIMSEQ_def) qed text â¹MPI converges monotonically to the optimal value from below. The iterates are sandwiched between @{const ââ©b} from below and @{const Uâ©b} from above.âº theorem mpi_conv: assumes "v0 â¤ ââ©b v0" shows "mpi_val â¢ Î½â©b_opt" and "ân. 
mpi_val n â¤ mpi_val (Suc n)" proof - define y where "y n = (ââ©b^^n) v0" for n have aux: "mpi_val n â¤ ââ©b (mpi_val n) â§ mpi_val n â¤ mpi_val (Suc n) â§ y n â¤ mpi_val n â§ mpi_val n â¤ w n" for n proof (induction n) case 0 show ?case using assms Bâ©b_eq_ââ©b unfolding y_def by (auto simp: mpi_val_eq blinfun.sum_left ð«â©1_n_disc_pos blincomp_scaleR_right sum_nonneg) next case (Suc n) have val_eq_W: "mpi_val (Suc n) = W (mk_dec_det (mpi_pol n)) (Suc (m n (mpi_val n))) (mpi_val n)" using Î½_improving_mpi_pol mpi_val_Suc W_eq_L_iter L_pow_def by auto hence *: "mpi_val (Suc n) â¤ ââ©b (mpi_val (Suc n))" using Suc.IH ââ©b_W_ge Î½_improving_mpi_pol by presburger moreover have "mpi_val (Suc n) â¤ mpi_val (Suc (Suc n))" using * by (simp add: Bâ©b_eq_ââ©b mpi_val_eq ð«â©1_n_disc_pos blincomp_scaleR_right blinfun.sum_left sum_nonneg) moreover have "mpi_val (Suc n) â¤ w (Suc n)" using Suc.IH Î½_improving_mpi_pol by (auto simp: val_eq_W intro: order.trans[OF _ W_le_Uâ©b]) moreover have "y (Suc n) â¤ mpi_val (Suc n)" using Suc.IH Î½_improving_mpi_pol W_ge_ââ©b by (auto simp: y_def Bâ©b_eq_ââ©b val_eq_W) ultimately show ?case by auto qed thus "mpi_val n â¤ mpi_val (Suc n)" for n by auto have "y â¢ Î½â©b_opt" using ââ©b_lim y_def by presburger thus "mpi_val â¢ Î½â©b_opt" using aux by (auto intro: tendsto_bfun_sandwich[OF _ w_conv]) qed subsection â¹$\epsilon$-Optimalityâº text â¹This gives an upper bound on the error of MPI.âº lemma mpi_pol_eps_opt: assumes "2 * l * dist (mpi_val n) (ââ©b (mpi_val n)) < eps * (1 - l)" "eps > 0" shows "dist (Î½â©b (mk_stationary_det (mpi_pol n))) (ââ©b (mpi_val n)) â¤ eps / 2" proof - let ?p = "mk_stationary_det (mpi_pol n)" let ?d = "mk_dec_det (mpi_pol n)" let ?v = "mpi_val n" have "dist (Î½â©b ?p) (ââ©b ?v) = dist (L ?d (Î½â©b ?p)) (ââ©b ?v)" using L_Î½_fix by force also have "â¦ = dist (L ?d (Î½â©b ?p)) (L ?d ?v)" by (metis Î½_improving_imp_ââ©b 
Î½_improving_mpi_pol) also have "â¦ â¤ dist (L ?d (Î½â©b ?p)) (L ?d (ââ©b ?v)) + dist (L ?d (ââ©b ?v)) (L ?d ?v)" using dist_triangle by blast also have "â¦ â¤ l * dist (Î½â©b ?p) (ââ©b ?v) + dist (L ?d (ââ©b ?v)) (L ?d ?v)" using contraction_L by auto also have "â¦ â¤ l * dist (Î½â©b ?p) (ââ©b ?v) + l * dist (ââ©b ?v) ?v" using contraction_L by auto finally have "dist (Î½â©b ?p) (ââ©b ?v) â¤ l * dist (Î½â©b ?p) (ââ©b ?v) + l * dist (ââ©b ?v) ?v". hence *:"(1-l) * dist (Î½â©b ?p) (ââ©b ?v) â¤ l * dist (ââ©b ?v) ?v" by (auto simp: left_diff_distrib) thus ?thesis proof (cases "l = 0") case True thus ?thesis using assms * by auto next case False have **: "dist (ââ©b ?v) (mpi_val n) < eps * (1 - l) / (2 * l)" using False le_neq_trans[OF zero_le_disc False[symmetric]] assms by (auto simp: dist_commute pos_less_divide_eq Groups.mult_ac(2)) have "dist (Î½â©b ?p) (ââ©b ?v) â¤ (l/ (1-l)) * dist (ââ©b ?v) ?v" using * by (auto simp: mult.commute pos_le_divide_eq) also have "â¦ â¤ (l/ (1-l)) * (eps * (1 - l) / (2 * l))" using ** by (fastforce intro!: mult_left_mono simp: divide_nonneg_pos) also have "â¦ = eps / 2" using False disc_lt_one by (auto simp: order.strict_iff_order) finally show "dist (Î½â©b ?p) (ââ©b ?v) â¤ eps / 2". qed qed lemma mpi_pol_opt: assumes "2 * l * dist (mpi_val n) (ââ©b (mpi_val n)) < eps * (1 - l)" "eps > 0" shows "dist (Î½â©b (mk_stationary_det (mpi_pol n))) (Î½â©b_opt) < eps" proof - have "dist (Î½â©b (mk_stationary_det (mpi_pol n))) (Î½â©b_opt) â¤ eps/2 + dist (ââ©b (mpi_val n)) Î½â©b_opt" by (metis mpi_pol_eps_opt[OF assms] dist_commute dist_triangle_le add_right_mono) thus ?thesis using dist_ââ©b_opt_eps assms by fastforce qed lemma mpi_val_term_ex: assumes "v0 â¤ ââ©b v0" "eps > 0" shows "ân. 2 * l * dist (mpi_val n) (ââ©b (mpi_val n)) < eps * (1 - l)" proof - note dist_ââ©b_lt_dist_opt have "(Î»n. 
dist (mpi_val n) Î½â©b_opt) â¢ 0" using mpi_conv(1)[OF assms(1)] tendsto_dist_iff by blast hence "(Î»n. dist (mpi_val n) (ââ©b (mpi_val n))) â¢ 0" using dist_ââ©b_lt_dist_opt by (auto simp: metric_LIMSEQ_I intro: tendsto_sandwich[of "Î»_. 0" _ _ "Î»n. 2 * dist (mpi_val n) Î½â©b_opt"]) hence "âe >0. ân. dist (mpi_val n) (ââ©b (mpi_val n)) < e" by (fastforce dest!: metric_LIMSEQ_D) hence "l â 0 â¹ ân. dist (mpi_val n) (ââ©b (mpi_val n)) < eps * (1 - l) / (2 * l)" by (simp add: assms order.not_eq_order_implies_strict) thus "ân. (2 * l) * dist (mpi_val n) (ââ©b (mpi_val n)) < eps * (1 - l)" using assms le_neq_trans[OF zero_le_disc] by (cases "l = 0") (auto simp: mult.commute pos_less_divide_eq) qed end subsection â¹Unbounded MPIâº context fixes eps Î´ :: real and M :: nat begin function (domintros) mpi_algo where "mpi_algo d v m = ( if 2 * l * dist v (ââ©b v) < eps * (1 - l) then (policy_improvement d v, v) else mpi_algo (policy_improvement d v) (L_pow v (policy_improvement d v) (m 0 v)) (Î»n. m (Suc n)))" by auto text â¹We define a tailrecursive version of @{const mpi} which more closely resembles @{const mpi_algo}.âº fun mpi' where "mpi' d v 0 m = (policy_improvement d v, v)" | "mpi' d v (Suc n) m = ( let d' = policy_improvement d v; v' = L_pow v d' (m 0 v) in mpi' d' v' n (Î»n. m (Suc n)))" lemma mpi_Suc': assumes "d â Dâ©D" shows "mpi d v m (Suc n) = mpi (policy_improvement d v) (L_pow v (policy_improvement d v) (m 0 v)) (Î»a. 
m (Suc a)) n" using assms policy_improvement_is_dec_det by (induction n rule: nat.induct) (auto simp: Let_def) lemma assumes "d â Dâ©D" shows "mpi d v m n = mpi' d v n m" using assms proof (induction n arbitrary: d v m rule: nat.induct) case (Suc nat) thus ?case using policy_improvement_is_dec_det by (auto simp: Let_def mpi_Suc'[OF Suc(2)] Suc.IH[symmetric]) qed auto lemma termination_mpi_algo: assumes "eps > 0" "d â Dâ©D" "v â¤ ââ©b v" shows "mpi_algo_dom (d, v, m)" proof - define n where "n = (LEAST n. 2 * l * dist (mpi_val d v m n) (ââ©b (mpi_val d v m n)) < eps * (1 - l))" (is "n = (LEAST n. ?P d v m n)") have least0: "ân. P n â¹ (LEAST n. P n) = (0 :: nat) â¹ P 0" for P by (metis LeastI_ex) from n_def assms show ?thesis proof (induction n arbitrary: v d m) case 0 have "2 * l * dist (mpi_val d v m 0) (ââ©b (mpi_val d v m 0)) < eps * (1 - l)" using least0 mpi_val_term_ex 0 by (metis (no_types, lifting)) thus ?case using 0 mpi_algo.domintros mpi_val_zero by (metis (no_types, opaque_lifting)) next case (Suc n v d m) let ?d = "policy_improvement d v" have "Suc n = Suc (LEAST n. 2 * l * dist (mpi_val d v m (Suc n)) (ââ©b (mpi_val d v m (Suc n))) < eps * (1 - l))" using mpi_val_term_ex[OF Suc.prems(3) â¹v â¤ ââ©b vâº â¹0 < epsâº, of m] Suc.prems by (subst Nat.Least_Suc[symmetric]) (auto intro: LeastI_ex) hence "n = (LEAST n. 2 * l * dist (mpi_val d v m (Suc n)) (ââ©b (mpi_val d v m (Suc n))) < eps * (1 - l))" by auto hence n_eq: "n = (LEAST n. 2 * l * dist (mpi_val ?d (L_pow v ?d (m 0 v)) (Î»a. m (Suc a)) n) (ââ©b (mpi_val ?d (L_pow v ?d (m 0 v)) (Î»a. m (Suc a)) n)) < eps * (1 - l))" using Suc.prems mpi_Suc' by (auto simp: is_dec_det_pi mpi_val_def) have "Â¬ 2 * l * dist v (ââ©b v) < eps * (1 - l)" using Suc mpi_val_zero by force moreover have "mpi_algo_dom (?d, L_pow v ?d (m 0 v), Î»a. 
m (Suc a))" using Suc.IH[OF n_eq â¹0 < epsâº] Suc.prems is_dec_det_pi L_pow_ââ©b_mono_inv by auto ultimately show ?case using mpi_algo.domintros by blast qed qed abbreviation "mpi_alg_rec d v m â¡ (if 2 * l * dist v (ââ©b v) < eps * (1 - l) then (policy_improvement d v, v) else mpi_algo (policy_improvement d v) (L_pow v (policy_improvement d v) (m 0 v)) (Î»n. m (Suc n)))" lemma mpi_algo_def': assumes "d â Dâ©D" "v â¤ ââ©b v" "eps > 0" shows "mpi_algo d v m = mpi_alg_rec d v m" using mpi_algo.psimps termination_mpi_algo assms by auto lemma mpi_algo_eq_mpi: assumes "d â Dâ©D" "v â¤ ââ©b v" "eps > 0" shows "mpi_algo d v m = mpi d v m (LEAST n. 2 * l * dist (mpi_val d v m n) (ââ©b (mpi_val d v m n)) < eps * (1 - l))" proof - define n where "n = (LEAST n. 2 * l * dist (mpi_val d v m n) (ââ©b (mpi_val d v m n)) < eps * (1 - l))" (is "n = (LEAST n. ?P d v m n)") from n_def assms show ?thesis proof (induction n arbitrary: d v m) case 0 have "?P d v m 0" by (metis (no_types, lifting) assms(3) LeastI_ex 0 mpi_val_term_ex) thus ?case using assms 0 by (auto simp: mpi_val_def mpi_algo_def') next case (Suc n) hence not0: "Â¬ (2 * l * dist v (ââ©b v) < eps * (1 - l))" using Suc(3) mpi_val_zero by auto obtain n' where "2 * l * dist (mpi_val d v m n') (ââ©b (mpi_val d v m n')) < eps * (1 - l)" using mpi_val_term_ex[OF Suc(3) Suc(4), of _ m] assms by blast hence "n = (LEAST n. ?P d v m (Suc n))" using Suc(2) Suc by (subst (asm) Least_Suc) auto hence "n = (LEAST n. ?P (policy_improvement d v) (L_pow v (policy_improvement d v) (m 0 v)) (Î»n. 
m (Suc n)) n)" using Suc(3) policy_improvement_is_dec_det mpi_Suc' by (auto simp: mpi_val_def) hence "mpi_algo d v m = mpi d v m (Suc n)" unfolding mpi_algo_def'[OF Suc.prems(2-4)] using Suc(1) Suc.prems(2-4) is_dec_det_pi mpi_Suc' not0 L_pow_ââ©b_mono_inv by force thus ?case using Suc.prems(1) by presburger qed qed lemma mpi_algo_opt: assumes "v0 â¤ ââ©b v0" "eps > 0" "d â Dâ©D" shows "dist (Î½â©b (mk_stationary_det (fst (mpi_algo d v0 m)))) Î½â©b_opt < eps" proof - let ?P = "Î»n. 2 * l * dist (mpi_val d v0 m n) (ââ©b (mpi_val d v0 m n)) < eps * (1 - l)" let ?n = "Least ?P" have "mpi_algo d v0 m = mpi d v0 m ?n" and "?P ?n" using mpi_algo_eq_mpi LeastI_ex[OF mpi_val_term_ex] assms by auto thus ?thesis using assms by (auto simp: mpi_pol_opt mpi_pol_def[symmetric]) qed end subsection â¹Initial Value Estimate @{term v0_mpi}âº text â¹We define an initial estimate of the value function for which Modified Policy Iteration always terminates.âº abbreviation "r_min â¡ (â¨ s' a. 
r (s', a))" definition "v0_mpi s = r_min / (1 - l)" lift_definition v0_mpiâ©b :: "'s ââ©b real" is "v0_mpi" by fastforce lemma v0_mpiâ©b_le_ââ©b: "v0_mpiâ©b â¤ ââ©b v0_mpiâ©b" proof (rule less_eq_bfunI) fix x have "r_min â¤ r (s, a)" for s a by (fastforce intro: cInf_lower2) hence "r_min â¤ (1-l) * r (s, a) + l * r_min" for s a using disc_lt_one zero_le_disc by (meson order_less_imp_le order_refl segment_bound_lemma) hence "r_min / (1 - l) â¤ ((1-l) * r (s, a) + l * r_min) / (1 - l)" for s a using order_less_imp_le[OF disc_lt_one] by (auto intro!: divide_right_mono) hence "r_min / (1 - l) â¤ r (s, a) + (l * r_min) / (1 - l)" for s a using disc_lt_one by (auto simp: add_divide_distrib) thus "v0_mpiâ©b x â¤ ââ©b v0_mpiâ©b x" unfolding ââ©b_eq_SUP_Lâ©a v0_mpiâ©b.rep_eq v0_mpi_def by (auto simp: A_ne intro: cSUP_upper2[where x = "arb_act (A x)"]) qed subsection â¹An Instance of Modified Policy Iteration with a Valid Conservative Initial Value Estimateâº definition "mpi_user eps m = ( if eps â¤ 0 then undefined else mpi_algo eps (Î»x. arb_act (A x)) v0_mpiâ©b m)" lemma mpi_user_eq: assumes "eps > 0" shows "mpi_user eps = mpi_alg_rec eps (Î»x. 
arb_act (A x)) v0_mpiâ©b" using v0_mpiâ©b_le_ââ©b assms by (auto simp: mpi_user_def mpi_algo_def' A_ne is_dec_det_def) lemma mpi_user_opt: assumes "eps > 0" shows "dist (Î½â©b (mk_stationary_det (fst (mpi_user eps n)))) Î½â©b_opt < eps" unfolding mpi_user_def using assms by (auto intro: mpi_algo_opt simp: is_dec_det_def A_ne v0_mpiâ©b_le_ââ©b) end end y> # Theory Matrix_Util theory Matrix_Util imports "HOL-Analysis.Analysis" begin section â¹Matricesâº proposition scalar_matrix_assoc': fixes C :: "('b::real_algebra_1)^'m^'n" shows "k *â©R (C ** D) = C ** (k *â©R D)" by (simp add: matrix_matrix_mult_def sum_distrib_left mult_ac vec_eq_iff scaleR_sum_right) subsection â¹Nonnegative Matricesâº lemma nonneg_matrix_nonneg [dest]: "0 â¤ m â¹ 0 â¤ m$ i $j" by (simp add: Finite_Cartesian_Product.less_eq_vec_def) lemma matrix_mult_mono: assumes "0 â¤ E" "0 â¤ C" "(E :: real^'c^'c) â¤ B" "C â¤ D" shows "E ** C â¤ B ** D" using order.trans[OF assms(1) assms(3)] assms unfolding Finite_Cartesian_Product.less_eq_vec_def by (auto intro!: sum_mono mult_mono simp: matrix_matrix_mult_def) lemma nonneg_matrix_mult: "0 â¤ (C :: ('b::{field, ordered_ring})^_^_) â¹ 0 â¤ D â¹ 0 â¤ C ** D" unfolding Finite_Cartesian_Product.less_eq_vec_def by (auto simp: matrix_matrix_mult_def intro!: sum_nonneg) lemma zero_le_mat_iff [simp]: "0 â¤ mat (x :: 'c :: {zero, order}) â· 0 â¤ x" by (auto simp: Finite_Cartesian_Product.less_eq_vec_def mat_def) lemma nonneg_mat_ge_zero: "0 â¤ Q â¹ 0 â¤ v â¹ 0 â¤ Q *v (v :: real^'c)" unfolding Finite_Cartesian_Product.less_eq_vec_def by (auto intro!: sum_nonneg simp: matrix_vector_mult_def) lemma nonneg_mat_mono: "0 â¤ Q â¹ u â¤ v â¹ Q *v u â¤ Q *v (v :: real^'c)" using nonneg_mat_ge_zero[of Q "v - u"] by (simp add: vec.diff) lemma nonneg_mult_imp_nonneg_mat: assumes "âv. 
v â¥ 0 â¹ X *v v â¥ 0" shows "X â¥ (0 :: real ^ _ ^_)" proof - { assume "Â¬ (0 â¤ X)" then obtain i j where neg: "X$ i $j < 0" by (metis less_eq_vec_def not_le zero_index) let ?v = "Ï k. if j = k then 1::real else 0" have "(X *v ?v)$ i < 0"
using neg
by (auto simp: matrix_vector_mult_def if_distrib cong: if_cong)
hence "?v â¥ 0 â§ Â¬ ((X *v ?v) â¥ 0)"
by (auto simp: less_eq_vec_def not_le)
hence "âv. v â¥ 0 â§ Â¬ X *v v â¥ 0"
by blast
}
thus ?thesis
using assms by auto
qed

(* Characterization: a real matrix is entrywise nonnegative iff it maps every
   nonnegative vector to a nonnegative vector.  Combines the two one-directional
   lemmas nonneg_mat_ge_zero and nonneg_mult_imp_nonneg_mat proved above. *)
lemma nonneg_mat_iff:
"(X â¥ (0 :: real ^ _ ^_)) â· (âv. v â¥ 0 â¶ X *v v â¥ 0)"
using nonneg_mat_ge_zero nonneg_mult_imp_nonneg_mat by auto

(* Pointwise-action characterization of the matrix order: X â¤ Y iff X *v x â¤ Y *v x
   for every nonnegative vector x.  Follows from nonneg_mat_iff applied to the
   difference Y - X, via distributivity of *v over subtraction. *)
lemma mat_le_iff: "(X â¤ Y) â· (âxâ¥0. (X::real^_^_) *v x â¤ Y *v x)"
by (metis diff_ge_0_iff_ge matrix_vector_mult_diff_rdistrib nonneg_mat_iff)

subsection â¹Matrix Powersâº

(* copied from Perron-Frobenius *)
(* Matrix power by primitive recursion: matpow A 0 is the identity matrix,
   and each successor step multiplies by A on the right. *)
primrec matpow :: "'a::semiring_1^'n^'n â nat â 'a^'n^'n" where
matpow_0:   "matpow A 0 = mat 1" |
matpow_Suc: "matpow A (Suc n) = (matpow A n) ** A"

(* Powers of a nonnegative real matrix stay nonnegative; by induction on the
   exponent, since the product of nonnegative matrices is nonnegative. *)
lemma nonneg_matpow: "0 â¤ X â¹ 0 â¤ matpow (X :: real ^ _ ^ _) i"
by (induction i) (auto simp: nonneg_matrix_mult)

(* Monotonicity of matrix powers on nonnegative matrices: if 0 â¤ C â¤ D then
   C^n â¤ D^n.  Induction on n, using matrix_mult_mono together with the
   nonnegativity of matpow C n from nonneg_matpow. *)
lemma matpow_mono: "0 â¤ C â¹ C â¤ D â¹ matpow (C :: real^_^_) n â¤ matpow D n"
by (induction n) (auto intro!: matrix_mult_mono nonneg_matpow)

lemma matpow_scaleR: "matpow (c *â©R (X :: 'b :: real_algebra_1^_^_)) n = (c^n) *â©R (matpow X) n"
proof (induction n arbitrary: X c)
case (Suc n)
lemma matrix_vector_mult_code': "(X *v x) $i = (âjâUNIV. X$ i $j * x$ j)"
`