diff --git a/thys/Frequency_Moments/Frequency_Moment_0.thy b/thys/Frequency_Moments/Frequency_Moment_0.thy --- a/thys/Frequency_Moments/Frequency_Moment_0.thy +++ b/thys/Frequency_Moments/Frequency_Moment_0.thy @@ -1,1314 +1,1314 @@ section \Frequency Moment $0$\label{sec:f0}\ theory Frequency_Moment_0 imports Frequency_Moments_Preliminary_Results Median_Method.Median K_Smallest Universal_Hash_Families.Carter_Wegman_Hash_Family Frequency_Moments Landau_Ext Product_PMF_Ext - Universal_Hash_Families.Field + Universal_Hash_Families.Universal_Hash_Families_More_Finite_Fields begin text \This section contains a formalization of a new algorithm for the zero-th frequency moment inspired by ideas described in \<^cite>\"baryossef2002"\. It is a KMV-type ($k$-minimum value) algorithm with a rounding method and matches the space complexity of the best algorithm described in \<^cite>\"baryossef2002"\. In addition to the Isabelle proof here, there is also an informal hand-written proof in Appendix~\ref{sec:f0_proof}.\ type_synonym f0_state = "nat \ nat \ nat \ nat \ (nat \ nat list) \ (nat \ float set)" definition hash where "hash p = ring.hash (mod_ring p)" fun f0_init :: "rat \ rat \ nat \ f0_state pmf" where "f0_init \ \ n = do { let s = nat \-18 * ln (real_of_rat \)\; let t = nat \80 / (real_of_rat \)\<^sup>2\; let p = prime_above (max n 19); let r = nat (4 * \log 2 (1 / real_of_rat \)\ + 23); h \ prod_pmf {.._. pmf_of_set (bounded_degree_polynomials (mod_ring p) 2)); return_pmf (s, t, p, r, h, (\_ \ {0.. f0_state \ f0_state pmf" where "f0_update x (s, t, p, r, h, sketch) = return_pmf (s, t, p, r, h, \i \ {.. rat pmf" where "f0_result (s, t, p, r, h, sketch) = return_pmf (median s (\i \ {.. rat \ rat) \ real" where "f0_space_usage (n, \, \) = ( let s = nat \-18 * ln (real_of_rat \)\ in let r = nat (4 * \log 2 (1 / real_of_rat \)\ + 23) in let t = nat \80 / (real_of_rat \)\<^sup>2 \ in 6 + 2 * log 2 (real s + 1) + 2 * log 2 (real t + 1) + 2 * log 2 (real n + 21) + 2 * log 2 (real r + 1) + real s * (5 + 2 * log 2 (21 + real n) + real t * (13 + 4 * r + 2 * log 2 (log 2 (real n + 13)))))" definition encode_f0_state :: "f0_state \ bool list option" where "encode_f0_state = N\<^sub>e \\<^sub>e (\s. N\<^sub>e \\<^sub>e ( N\<^sub>e \\<^sub>e (\p. N\<^sub>e \\<^sub>e ( ([0..\<^sub>e (P\<^sub>e p 2)) \\<^sub>e ([0..\<^sub>e (S\<^sub>e F\<^sub>e))))))" lemma "inj_on encode_f0_state (dom encode_f0_state)" proof - have "is_encoding encode_f0_state" unfolding encode_f0_state_def by (intro dependent_encoding exp_golomb_encoding poly_encoding fun_encoding set_encoding float_encoding) thus ?thesis by (rule encoding_imp_inj) qed context fixes \ \ :: rat fixes n :: nat fixes as :: "nat list" fixes result assumes \_range: "\ \ {0<..<1}" assumes \_range: "\ \ {0<..<1}" assumes as_range: "set as \ {.. fold (\a state. state \ f0_update a) as (f0_init \ \ n) \ f0_result" begin private definition t where "t = nat \80 / (real_of_rat \)\<^sup>2\" private lemma t_gt_0: "t > 0" using \_range by (simp add:t_def) private definition s where "s = nat \-(18 * ln (real_of_rat \))\" private lemma s_gt_0: "s > 0" using \_range by (simp add:s_def) private definition p where "p = prime_above (max n 19)" private lemma p_prime:"Factorial_Ring.prime p" using p_def prime_above_prime by presburger private lemma p_ge_18: "p \ 18" proof - have "p \ 19" by (metis p_def prime_above_lower_bound max.bounded_iff) thus ?thesis by simp qed private lemma p_gt_0: "p > 0" using p_ge_18 by simp private lemma p_gt_1: "p > 1" using p_ge_18 by simp private lemma n_le_p: "n \ p" proof - have "n \ max n 19" by simp also have "... \ p" unfolding p_def by (rule prime_above_lower_bound) finally show ?thesis by simp qed private lemma p_le_n: "p \ 2*n + 40" proof - have "p \ 2 * (max n 19) + 2" by (subst p_def, rule prime_above_upper_bound) also have "... \ 2 * n + 40" by (cases "n \ 19", auto) finally show ?thesis by simp qed private lemma as_lt_p: "\x. x \ set as \ x < p" using as_range atLeastLessThan_iff by (intro order_less_le_trans[OF _ n_le_p]) blast private lemma as_subset_p: "set as \ {..log 2 (1 / real_of_rat \)\ + 23)" private lemma r_bound: "4 * log 2 (1 / real_of_rat \) + 23 \ r" proof - have "0 \ log 2 (1 / real_of_rat \)" using \_range by simp hence "0 \ \log 2 (1 / real_of_rat \)\" by simp hence "0 \ 4 * \log 2 (1 / real_of_rat \)\ + 23" by (intro add_nonneg_nonneg mult_nonneg_nonneg, auto) thus ?thesis by (simp add:r_def) qed private lemma r_ge_23: "r \ 23" proof - have "(23::real) = 0 + 23" by simp also have "... \ 4 * log 2 (1 / real_of_rat \) + 23" using \_range by (intro add_mono mult_nonneg_nonneg, auto) also have "... \ r" using r_bound by simp finally show "23 \ r" by simp qed private lemma two_pow_r_le_1: "0 < 1 - 2 powr - real r" proof - have a: "2 powr (0::real) = 1" by simp show ?thesis using r_ge_23 by (simp, subst a[symmetric], intro powr_less_mono, auto) qed interpretation carter_wegman_hash_family "mod_ring p" 2 rewrites "ring.hash (mod_ring p) = Frequency_Moment_0.hash p" using carter_wegman_hash_familyI[OF mod_ring_is_field mod_ring_finite] using hash_def p_prime by auto private definition tr_hash where "tr_hash x \ = truncate_down r (hash x \)" private definition sketch_rv where "sketch_rv \ = least t ((\x. float_of (tr_hash x \)) ` set as)" private definition estimate where "estimate S = (if card S < t then of_nat (card S) else of_nat t * of_nat p / rat_of_float (Max S))" private definition sketch_rv' where "sketch_rv' \ = least t ((\x. tr_hash x \) ` set as)" private definition estimate' where "estimate' S = (if card S < t then real (card S) else real t * real p / Max S)" private definition \\<^sub>0 where "\\<^sub>0 = prod_pmf {.._. pmf_of_set space)" private lemma f0_alg_sketch: defines "sketch \ fold (\a state. state \ f0_update a) as (f0_init \ \ n)" shows "sketch = map_pmf (\x. (s,t,p,r, x, \i \ {..\<^sub>0" unfolding sketch_rv_def proof (subst sketch_def, induction as rule:rev_induct) case Nil then show ?case by (simp add:s_def p_def[symmetric] map_pmf_def t_def r_def Let_def least_def restrict_def space_def \\<^sub>0_def) next case (snoc x xs) let ?sketch = "\\ xs. least t ((\a. float_of (tr_hash a \)) ` set xs)" have "fold (\a state. state \ f0_update a) (xs @ [x]) (f0_init \ \ n) = (map_pmf (\\. (s, t, p, r, \, \i \ {.. i) xs)) \\<^sub>0) \ f0_update x" by (simp add: restrict_def snoc del:f0_init.simps) also have "... = \\<^sub>0 \ (\\. f0_update x (s, t, p, r, \, \i\{.. i) xs)) " by (simp add:map_pmf_def bind_assoc_pmf bind_return_pmf del:f0_update.simps) also have "... = map_pmf (\\. (s, t, p, r, \, \i\{.. i) (xs@[x]))) \\<^sub>0" by (simp add:least_insert map_pmf_def tr_hash_def cong:restrict_cong) finally show ?case by blast qed private lemma card_nat_in_ball: fixes x :: nat fixes q :: real assumes "q \ 0" defines "A \ {k. abs (real x - real k) \ q \ k \ x}" shows "real (card A) \ 2 * q" and "finite A" proof - have a: "of_nat x \ {\real x-q\..\real x+q\}" using assms by (simp add: ceiling_le_iff) have "card A = card (int ` A)" by (rule card_image[symmetric], simp) also have "... \ card ({\real x-q\..\real x+q\} - {of_nat x})" by (intro card_mono image_subsetI, simp_all add:A_def abs_le_iff, linarith) also have "... = card {\real x-q\..\real x+q\} - 1" by (rule card_Diff_singleton, rule a) also have "... = int (card {\real x-q\..\real x+q\}) - int 1" by (intro of_nat_diff) (metis a card_0_eq empty_iff finite_atLeastAtMost_int less_one linorder_not_le) also have "... \ \q+real x\+1 -\real x-q\ - 1" using assms by (simp, linarith) also have "... \ 2*q" by linarith finally show "card A \ 2 * q" by simp have "A \ {..x + nat \q\}" by (rule subsetI, simp add:A_def abs_le_iff, linarith) thus "finite A" by (rule finite_subset, simp) qed private lemma prob_degree_lt_1: "prob {\. degree \ < 1} \ 1/real p" proof - have "space \ {\. length \ \ Suc 0} = bounded_degree_polynomials (mod_ring p) 1" by (auto simp:set_eq_iff bounded_degree_polynomials_def space_def) moreover have "field_size = p" by (simp add:mod_ring_def) hence "real (card (bounded_degree_polynomials (mod_ring p) (Suc 0))) / real (card space) = 1 / real p" by (simp add:space_def bounded_degree_polynomials_card power2_eq_square) ultimately show ?thesis by (simp add:M_def measure_pmf_of_set) qed private lemma collision_prob: assumes "c \ 1" shows "prob {\. \x \ set as. \y \ set as. x \ y \ tr_hash x \ \ c \ tr_hash x \ = tr_hash y \} \ (5/2) * (real (card (set as)))\<^sup>2 * c\<^sup>2 * 2 powr -(real r) / (real p)\<^sup>2 + 1/real p" (is "prob {\. ?l \} \ ?r1 + ?r2") proof - define \ :: real where "\ = 9/8" have rho_c_ge_0: "\ * c \ 0" unfolding \_def using assms by simp have c_ge_0: "c\0" using assms by simp have "degree \ \ 1 \ \ \ space \ degree \ = 1" for \ by (simp add:bounded_degree_polynomials_def space_def) (metis One_nat_def Suc_1 le_less_Suc_eq less_imp_diff_less list.size(3) pos2) hence a: "\\ x y. x < p \ y < p \ x \ y \ degree \ \ 1 \ \ \ space \ hash x \ \ hash y \" using inj_onD[OF inj_if_degree_1] mod_ring_carr by blast have b: "prob {\. degree \ \ 1 \ tr_hash x \ \ c \ tr_hash x \ = tr_hash y \} \ 5 * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2" if b_assms: "x \ set as" "y \ set as" "x < y" for x y proof - have c: "real u \ \ * c \ \real u - real v\ \ \ * c * 2 powr (-real r)" if c_assms:"truncate_down r (real u) \ c" "truncate_down r (real u) = truncate_down r (real v)" for u v proof - have "9 * 2 powr - real r \ 9 * 2 powr (- real 23)" using r_ge_23 by (intro mult_left_mono powr_mono, auto) also have "... \ 1" by simp finally have "9 * 2 powr - real r \ 1" by simp hence "1 \ \ * (1 - 2 powr (- real r))" by (simp add:\_def) hence d: "(c*1) / (1 - 2 powr (-real r)) \ c * \" using assms two_pow_r_le_1 by (simp add: pos_divide_le_eq) have "\x. truncate_down r (real x) \ c \ real x * (1 - 2 powr - real r) \ c * 1" using truncate_down_pos[OF of_nat_0_le_iff] order_trans by (simp, blast) hence "\x. truncate_down r (real x) \ c \ real x \ c * \" using two_pow_r_le_1 by (intro order_trans[OF _ d], simp add: pos_le_divide_eq) hence e: "real u \ c * \" "real v \ c * \" using c_assms by auto have " \real u - real v\ \ (max \real u\ \real v\) * 2 powr (-real r)" using c_assms by (intro truncate_down_eq, simp) also have "... \ (c * \) * 2 powr (-real r)" using e by (intro mult_right_mono, auto) finally have "\real u - real v\ \ \ * c * 2 powr (-real r)" by (simp add:algebra_simps) thus ?thesis using e by (simp add:algebra_simps) qed have "prob {\. degree \ \ 1 \ tr_hash x \ \ c \ tr_hash x \ = tr_hash y \} \ prob (\ i \ {(u,v) \ {.. {.. v \ truncate_down r u \ c \ truncate_down r u = truncate_down r v}. {\. hash x \ = fst i \ hash y \ = snd i})" using a by (intro pmf_mono[OF M_def], simp add:tr_hash_def) (metis hash_range mod_ring_carr b_assms as_subset_p lessThan_iff nat_neq_iff subset_eq) also have "... \ (\ i\ {(u,v) \ {.. {.. v \ truncate_down r u \ c \ truncate_down r u = truncate_down r v}. prob {\. hash x \ = fst i \ hash y \ = snd i})" by (intro measure_UNION_le finite_cartesian_product finite_subset[where B="{0.. {0.. (\ i\ {(u,v) \ {.. {.. v \ truncate_down r u \ c \ truncate_down r u = truncate_down r v}. prob {\. (\u \ {x,y}. hash u \ = (if u = x then (fst i) else (snd i)))})" by (intro sum_mono pmf_mono[OF M_def]) force also have "... \ (\ i\ {(u,v) \ {.. {.. v \ truncate_down r u \ c \ truncate_down r u = truncate_down r v}. 1/(real p)\<^sup>2)" using assms as_subset_p b_assms by (intro sum_mono, subst hash_prob) (auto simp add: mod_ring_def power2_eq_square) also have "... = 1/(real p)\<^sup>2 * card {(u,v) \ {0.. {0.. v \ truncate_down r u \ c \ truncate_down r u = truncate_down r v}" by simp also have "... \ 1/(real p)\<^sup>2 * card {(u,v) \ {.. {.. v \ real u \ \ * c \ abs (real u - real v) \ \ * c * 2 powr (-real r)}" using c by (intro mult_mono of_nat_mono card_mono finite_cartesian_product finite_subset[where B="{..{.. 1/(real p)\<^sup>2 * card (\u' \ {u. u < p \ real u \ \ * c}. {(u::nat,v::nat). u = u' \ abs (real u - real v) \ \ * c * 2 powr (-real r) \ v < p \ v \ u'})" by (intro mult_left_mono of_nat_mono card_mono finite_cartesian_product finite_subset[where B="{..{.. 1/(real p)\<^sup>2 * (\ u' \ {u. u < p \ real u \ \ * c}. card {(u,v). u = u' \ abs (real u - real v) \ \ * c * 2 powr (-real r) \ v < p \ v \ u'})" by (intro mult_left_mono of_nat_mono card_UN_le, auto) also have "... = 1/(real p)\<^sup>2 * (\ u' \ {u. u < p \ real u \ \ * c}. card ((\x. (u' ,x)) ` {v. abs (real u' - real v) \ \ * c * 2 powr (-real r) \ v < p \ v \ u'}))" by (intro arg_cong2[where f="(*)"] arg_cong[where f="real"] sum.cong arg_cong[where f="card"]) (auto simp add:set_eq_iff) also have "... \ 1/(real p)\<^sup>2 * (\ u' \ {u. u < p \ real u \ \ * c}. card {v. abs (real u' - real v) \ \ * c * 2 powr (-real r) \ v < p \ v \ u'})" by (intro mult_left_mono of_nat_mono sum_mono card_image_le, auto) also have "... \ 1/(real p)\<^sup>2 * (\ u' \ {u. u < p \ real u \ \ * c}. card {v. abs (real u' - real v) \ \ * c * 2 powr (-real r) \ v \ u'})" by (intro mult_left_mono sum_mono of_nat_mono card_mono card_nat_in_ball subsetI) auto also have "... \ 1/(real p)\<^sup>2 * (\ u' \ {u. u < p \ real u \ \ * c}. real (card {v. abs (real u' - real v) \ \ * c * 2 powr (-real r) \ v \ u'}))" by simp also have "... \ 1/(real p)\<^sup>2 * (\ u' \ {u. u < p \ real u \ \ * c}. 2 * (\ * c * 2 powr (-real r)))" by (intro mult_left_mono sum_mono card_nat_in_ball(1), auto) also have "... = 1/(real p)\<^sup>2 * (real (card {u. u < p \ real u \ \ * c}) * (2 * (\ * c * 2 powr (-real r))))" by simp also have "... \ 1/(real p)\<^sup>2 * (real (card {u. u \ nat (\\ * c \)}) * (2 * (\ * c * 2 powr (-real r))))" using rho_c_ge_0 le_nat_floor by (intro mult_left_mono mult_right_mono of_nat_mono card_mono subsetI) auto also have "... \ 1/(real p)\<^sup>2 * ((1+\ * c) * (2 * (\ * c * 2 powr (-real r))))" using rho_c_ge_0 by (intro mult_left_mono mult_right_mono, auto) also have "... \ 1/(real p)\<^sup>2 * (((1+\) * c) * (2 * (\ * c * 2 powr (-real r))))" using assms by (intro mult_mono, auto simp add:distrib_left distrib_right \_def) also have "... = (\ * (2 + \ * 2)) * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2" by (simp add:ac_simps power2_eq_square) also have "... \ 5 * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2" by (intro divide_right_mono mult_right_mono) (auto simp add:\_def) finally show ?thesis by simp qed have "prob {\. ?l \ \ degree \ \ 1} \ prob (\ i \ {(x,y) \ (set as) \ (set as). x < y}. {\. degree \ \ 1 \ tr_hash (fst i) \ \ c \ tr_hash (fst i) \ = tr_hash (snd i) \})" by (rule pmf_mono[OF M_def], simp, metis linorder_neqE_nat) also have "... \ (\ i \ {(x,y) \ (set as) \ (set as). x < y}. prob {\. degree \ \ 1 \ tr_hash (fst i) \ \ c \ tr_hash (fst i) \ = tr_hash (snd i) \})" unfolding M_def by (intro measure_UNION_le finite_cartesian_product finite_subset[where B="(set as) \ (set as)"]) auto also have "... \ (\ i \ {(x,y) \ (set as) \ (set as). x < y}. 5 * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2)" using b by (intro sum_mono, simp add:case_prod_beta) also have "... = ((5/2) * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2) * (2 * card {(x,y) \ (set as) \ (set as). x < y})" by simp also have "... = ((5/2) * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2) * (card (set as) * (card (set as) - 1))" by (subst card_ordered_pairs, auto) also have "... \ ((5/2) * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2) * (real (card (set as)))\<^sup>2" by (intro mult_left_mono) (auto simp add:power2_eq_square mult_left_mono) also have "... = (5/2) * (real (card (set as)))\<^sup>2 * c\<^sup>2 * 2 powr (-real r) /(real p)\<^sup>2" by (simp add:algebra_simps) finally have f:"prob {\. ?l \ \ degree \ \ 1} \ ?r1" by simp have "prob {\. ?l \} \ prob {\. ?l \ \ degree \ \ 1} + prob {\. degree \ < 1}" by (rule pmf_add[OF M_def], auto) also have "... \ ?r1 + ?r2" by (intro add_mono f prob_degree_lt_1) finally show ?thesis by simp qed private lemma of_bool_square: "(of_bool x)\<^sup>2 = ((of_bool x)::real)" by (cases x, auto) private definition Q where "Q y \ = card {x \ set as. int (hash x \) < y}" private definition m where "m = card (set as)" private lemma assumes "a \ 0" assumes "a \ int p" shows exp_Q: "expectation (\\. real (Q a \)) = real m * (of_int a) / p" and var_Q: "variance (\\. real (Q a \)) \ real m * (of_int a) / p" proof - have exp_single: "expectation (\\. of_bool (int (hash x \) < a)) = real_of_int a /real p" if a:"x \ set as" for x proof - have x_le_p: "x < p" using a as_lt_p by simp have "expectation (\\. of_bool (int (hash x \) < a)) = expectation (indicat_real {\. int (Frequency_Moment_0.hash p x \) < a})" by (intro arg_cong2[where f="integral\<^sup>L"] ext, simp_all) also have "... = prob {\. hash x \ \ {k. int k < a}}" by (simp add:M_def) also have "... = card ({k. int k < a} \ {..\. of_bool (int (hash x \) < a)) = real_of_int a /real p" by simp qed have "expectation(\\. real (Q a \)) = expectation (\\. (\x \ set as. of_bool (int (hash x \) < a)))" by (simp add:Q_def Int_def) also have "... = (\x \ set as. expectation (\\. of_bool (int (hash x \) < a)))" by (rule Bochner_Integration.integral_sum, simp) also have "... = (\ x \ set as. a /real p)" by (rule sum.cong, simp, subst exp_single, simp, simp) also have "... = real m * real_of_int a / real p" by (simp add:m_def) finally show "expectation (\\. real (Q a \)) = real m * real_of_int a / p" by simp have indep: "J \ set as \ card J = 2 \ indep_vars (\_. borel) (\i x. of_bool (int (hash i x) < a)) J" for J using as_subset_p mod_ring_carr by (intro indep_vars_compose2[where Y="\i x. of_bool (int x < a)" and M'="\_. discrete"] k_wise_indep_vars_subset[OF k_wise_indep] finite_subset[OF _ finite_set]) auto have rv: "\x. x \ set as \ random_variable borel (\\. of_bool (int (hash x \) < a))" by (simp add:M_def) have "variance (\\. real (Q a \)) = variance (\\. (\x \ set as. of_bool (int (hash x \) < a)))" by (simp add:Q_def Int_def) also have "... = (\x \ set as. variance (\\. of_bool (int (hash x \) < a)))" by (intro var_sum_pairwise_indep_2 indep rv) auto also have "... \ (\ x \ set as. a / real p)" by (rule sum_mono, simp add: variance_eq of_bool_square, simp add: exp_single) also have "... = real m * real_of_int a /real p" by (simp add:m_def) finally show "variance (\\. real (Q a \)) \ real m * real_of_int a / p" by simp qed private lemma t_bound: "t \ 81 / (real_of_rat \)\<^sup>2" proof - have "t \ 80 / (real_of_rat \)\<^sup>2 + 1" using t_def t_gt_0 by linarith also have "... \ 80 / (real_of_rat \)\<^sup>2 + 1 / (real_of_rat \)\<^sup>2" using \_range by (intro add_mono, simp, simp add:power_le_one) also have "... = 81 / (real_of_rat \)\<^sup>2" by simp finally show ?thesis by simp qed private lemma t_r_bound: "18 * 40 * (real t)\<^sup>2 * 2 powr (-real r) \ 1" proof - have "720 * (real t)\<^sup>2 * 2 powr (-real r) \ 720 * (81 / (real_of_rat \)\<^sup>2)\<^sup>2 * 2 powr (-4 * log 2 (1 / real_of_rat \) - 23)" using r_bound t_bound by (intro mult_left_mono mult_mono power_mono powr_mono, auto) also have "... \ 720 * (81 / (real_of_rat \)\<^sup>2)\<^sup>2 * (2 powr (-4 * log 2 (1 / real_of_rat \)) * 2 powr (-23))" using \_range by (intro mult_left_mono mult_mono power_mono add_mono) (simp_all add:power_le_one powr_diff) also have "... = 720 * (81\<^sup>2 / (real_of_rat \)^4) * (2 powr (log 2 ((real_of_rat \)^4)) * 2 powr (-23))" using \_range by (intro arg_cong2[where f="(*)"]) (simp_all add:power2_eq_square power4_eq_xxxx log_divide log_powr[symmetric]) also have "... = 720 * 81\<^sup>2 * 2 powr (-23)" using \_range by simp also have "... \ 1" by simp finally show ?thesis by simp qed private lemma m_eq_F_0: "real m = of_rat (F 0 as)" by (simp add:m_def F_def) private lemma estimate'_bounds: "prob {\. of_rat \ * real_of_rat (F 0 as) < \estimate' (sketch_rv' \) - of_rat (F 0 as)\} \ 1/3" proof (cases "card (set as) \ t") case True define \' where "\' = 3 * real_of_rat \ / 4" define u where "u = \real t * p / (m * (1+\'))\" define v where "v = \real t * p / (m * (1-\'))\" define has_no_collision where "has_no_collision = (\\. \x\ set as. \y \ set as. (tr_hash x \ = tr_hash y \ \ x = y) \ tr_hash x \ > v)" have "2 powr (-real r) \ 2 powr (-(4 * log 2 (1 / real_of_rat \) + 23))" using r_bound by (intro powr_mono, linarith, simp) also have "... = 2 powr (-4 * log 2 (1 /real_of_rat \) -23)" by (rule arg_cong2[where f="(powr)"], auto simp add:algebra_simps) also have "... \ 2 powr ( -1 * log 2 (1 /real_of_rat \) -4)" using \_range by (intro powr_mono diff_mono, auto) also have "... = 2 powr ( -1 * log 2 (1 /real_of_rat \)) / 16" by (simp add: powr_diff) also have "... = real_of_rat \ / 16" using \_range by (simp add:log_divide) also have "... < real_of_rat \ / 8" using \_range by (subst pos_divide_less_eq, auto) finally have r_le_\: "2 powr (-real r) < real_of_rat \ / 8" by simp have \'_gt_0: "\' > 0" using \_range by (simp add:\'_def) have "\' < 3/4" using \_range by (simp add:\'_def)+ also have "... < 1" by simp finally have \'_lt_1: "\' < 1" by simp have "t \ 81 / (real_of_rat \)\<^sup>2" using t_bound by simp also have "... = (81*9/16) / (\')\<^sup>2" by (simp add:\'_def power2_eq_square) also have "... \ 46 / \'\<^sup>2" by (intro divide_right_mono, simp, simp) finally have t_le_\': "t \ 46/ \'\<^sup>2" by simp have "80 \ (real_of_rat \)\<^sup>2 * (80 / (real_of_rat \)\<^sup>2)" using \_range by simp also have "... \ (real_of_rat \)\<^sup>2 * t" by (intro mult_left_mono, simp add:t_def of_nat_ceiling, simp) finally have "80 \ (real_of_rat \)\<^sup>2 * t" by simp hence t_ge_\': "45 \ t * \' * \'" by (simp add:\'_def power2_eq_square) have "m \ card {.. p" using n_le_p by simp finally have m_le_p: "m \ p" by simp hence t_le_m: "t \ card (set as)" using True by simp have m_ge_0: "real m > 0" using m_def True t_gt_0 by simp have "v \ real t * real p / (real m * (1 - \'))" by (simp add:v_def) also have "... \ real t * real p / (real m * (1/4))" using \'_lt_1 m_ge_0 \_range by (intro divide_left_mono mult_left_mono mult_nonneg_nonneg mult_pos_pos, simp_all add:\'_def) finally have v_ubound: "v \ 4 * real t * real p / real m" by (simp add:algebra_simps) have a_ge_1: "u \ 1" using \'_gt_0 p_gt_0 m_ge_0 t_gt_0 by (auto intro!:mult_pos_pos divide_pos_pos simp add:u_def) hence a_ge_0: "u \ 0" by simp have "real m * (1 - \') < real m" using \'_gt_0 m_ge_0 by simp also have "... \ 1 * real p" using m_le_p by simp also have "... \ real t * real p" using t_gt_0 by (intro mult_right_mono, auto) finally have " real m * (1 - \') < real t * real p" by simp hence v_gt_0: "v > 0" using mult_pos_pos m_ge_0 \'_lt_1 by (simp add:v_def) hence v_ge_1: "real_of_int v \ 1" by linarith have "real t \ real m" using True m_def by linarith also have "... < (1 + \') * real m" using \'_gt_0 m_ge_0 by force finally have a_le_p_aux: "real t < (1 + \') * real m" by simp have "u \ real t * real p / (real m * (1 + \'))+1" by (simp add:u_def) also have "... < real p + 1" using m_ge_0 \'_gt_0 a_le_p_aux a_le_p_aux p_gt_0 by (simp add: pos_divide_less_eq ac_simps) finally have "u \ real p" by (metis int_less_real_le not_less of_int_le_iff of_int_of_nat_eq) hence u_le_p: "u \ int p" by linarith have "prob {\. Q u \ \ t} \ prob {\ \ Sigma_Algebra.space M. abs (real (Q u \) - expectation (\\. real (Q u \))) \ 3 * sqrt (m * real_of_int u / p)}" proof (rule pmf_mono[OF M_def]) fix \ assume "\ \ {\. t \ Q u \}" hence t_le: "t \ Q u \" by simp have "real m * real_of_int u / real p \ real m * (real t * real p / (real m * (1 + \'))+1) / real p" using m_ge_0 p_gt_0 by (intro divide_right_mono mult_left_mono, simp_all add: u_def) also have "... = real m * real t * real p / (real m * (1+\') * real p) + real m / real p" by (simp add:distrib_left add_divide_distrib) also have "... = real t / (1+\') + real m / real p" using p_gt_0 m_ge_0 by simp also have "... \ real t / (1+\') + 1" using m_le_p p_gt_0 by (intro add_mono, auto) finally have "real m * real_of_int u / real p \ real t / (1 + \') + 1" by simp hence "3 * sqrt (real m * of_int u / real p) + real m * of_int u / real p \ 3 * sqrt (t / (1+\')+1)+(t/(1+\')+1)" by (intro add_mono mult_left_mono real_sqrt_le_mono, auto) also have "... \ 3 * sqrt (real t+1) + ((t * (1 - \' / (1+\'))) + 1)" using \'_gt_0 t_gt_0 by (intro add_mono mult_left_mono real_sqrt_le_mono) (simp_all add: pos_divide_le_eq left_diff_distrib) also have "... = 3 * sqrt (real t+1) + (t - \' * t / (1+\')) + 1" by (simp add:algebra_simps) also have "... \ 3 * sqrt (46 / \'\<^sup>2 + 1 / \'\<^sup>2) + (t - \' * t/2) + 1 / \'" using \'_gt_0 t_gt_0 \'_lt_1 add_pos_pos t_le_\' by (intro add_mono mult_left_mono real_sqrt_le_mono add_mono) (simp_all add: power_le_one pos_le_divide_eq) also have "... \ (21 / \' + (t - 45 / (2*\'))) + 1 / \'" using \'_gt_0 t_ge_\' by (intro add_mono) (simp_all add:real_sqrt_divide divide_le_cancel real_le_lsqrt pos_divide_le_eq ac_simps) also have "... \ t" using \'_gt_0 by simp also have "... \ Q u \" using t_le by simp finally have "3 * sqrt (real m * of_int u / real p) + real m * of_int u / real p \ Q u \" by simp hence " 3 * sqrt (real m * real_of_int u / real p) \ \real (Q u \) - expectation (\\. real (Q u \))\" using a_ge_0 u_le_p True by (simp add:exp_Q abs_ge_iff) thus "\ \ {\ \ Sigma_Algebra.space M. 3 * sqrt (real m * real_of_int u / real p) \ \real (Q u \) - expectation (\\. real (Q u \))\}" by (simp add: M_def) qed also have "... \ variance (\\. real (Q u \)) / (3 * sqrt (real m * of_int u / real p))\<^sup>2" using a_ge_1 p_gt_0 m_ge_0 by (intro Chebyshev_inequality, simp add:M_def, auto) also have "... \ (real m * real_of_int u / real p) / (3 * sqrt (real m * of_int u / real p))\<^sup>2" using a_ge_0 u_le_p by (intro divide_right_mono var_Q, auto) also have "... \ 1/9" using a_ge_0 by simp finally have case_1: "prob {\. Q u \ \ t} \ 1/9" by simp have case_2: "prob {\. Q v \ < t} \ 1/9" proof (cases "v \ p") case True have "prob {\. Q v \ < t} \ prob {\ \ Sigma_Algebra.space M. abs (real (Q v \) - expectation (\\. real (Q v \))) \ 3 * sqrt (m * real_of_int v / p)}" proof (rule pmf_mono[OF M_def]) fix \ assume "\ \ set_pmf (pmf_of_set space)" have "(real t + 3 * sqrt (real t / (1 - \') )) * (1 - \') = real t - \' * t + 3 * ((1-\') * sqrt( real t / (1-\') ))" by (simp add:algebra_simps) also have "... = real t - \' * t + 3 * sqrt ( (1-\')\<^sup>2 * (real t / (1-\')))" using \'_lt_1 by (subst real_sqrt_mult, simp) also have "... = real t - \' * t + 3 * sqrt ( real t * (1- \'))" by (simp add:power2_eq_square distrib_left) also have "... \ real t - 45/ \' + 3 * sqrt ( real t )" using \'_gt_0 t_ge_\' \'_lt_1 by (intro add_mono mult_left_mono real_sqrt_le_mono) (simp_all add:pos_divide_le_eq ac_simps left_diff_distrib power_le_one) also have "... \ real t - 45/ \' + 3 * sqrt ( 46 / \'\<^sup>2)" using t_le_\' \'_lt_1 \'_gt_0 by (intro add_mono mult_left_mono real_sqrt_le_mono, simp_all add:pos_divide_le_eq power_le_one) also have "... = real t + (3 * sqrt(46) - 45)/ \'" using \'_gt_0 by (simp add:real_sqrt_divide diff_divide_distrib) also have "... \ t" using \'_gt_0 by (simp add:pos_divide_le_eq real_le_lsqrt) finally have aux: "(real t + 3 * sqrt (real t / (1 - \'))) * (1 - \') \ real t " by simp assume "\ \ {\. Q v \ < t}" hence "Q v \ < t" by simp hence "real (Q v \) + 3 * sqrt (real m * real_of_int v / real p) \ real t - 1 + 3 * sqrt (real m * real_of_int v / real p)" using m_le_p p_gt_0 by (intro add_mono, auto simp add: algebra_simps add_divide_distrib) also have "... \ (real t-1) + 3 * sqrt (real m * (real t * real p / (real m * (1- \'))) / real p)" by (intro add_mono mult_left_mono real_sqrt_le_mono divide_right_mono) (auto simp add:v_def) also have "... \ real t + 3 * sqrt(real t / (1-\')) - 1" using m_ge_0 p_gt_0 by simp also have "... \ real t / (1-\')-1" using \'_lt_1 aux by (simp add: pos_le_divide_eq) also have "... \ real m * (real t * real p / (real m * (1-\'))) / real p - 1" using p_gt_0 m_ge_0 by simp also have "... \ real m * (real t * real p / (real m * (1-\'))) / real p - real m / real p" using m_le_p p_gt_0 by (intro diff_mono, auto) also have "... = real m * (real t * real p / (real m * (1-\'))-1) / real p" by (simp add: left_diff_distrib right_diff_distrib diff_divide_distrib) also have "... \ real m * real_of_int v / real p" by (intro divide_right_mono mult_left_mono, simp_all add:v_def) finally have "real (Q v \) + 3 * sqrt (real m * real_of_int v / real p) \ real m * real_of_int v / real p" by simp hence " 3 * sqrt (real m * real_of_int v / real p) \ \real (Q v \) -expectation (\\. real (Q v \))\" using v_gt_0 True by (simp add: exp_Q abs_ge_iff) thus "\ \ {\\ Sigma_Algebra.space M. 3 * sqrt (real m * real_of_int v / real p) \ \real (Q v \) - expectation (\\. real (Q v \))\}" by (simp add:M_def) qed also have "... \ variance (\\. real (Q v \)) / (3 * sqrt (real m * real_of_int v / real p))\<^sup>2" using v_gt_0 p_gt_0 m_ge_0 by (intro Chebyshev_inequality, simp add:M_def, auto) also have "... \ (real m * real_of_int v / real p) / (3 * sqrt (real m * real_of_int v / real p))\<^sup>2" using v_gt_0 True by (intro divide_right_mono var_Q, auto) also have "... = 1/9" using p_gt_0 v_gt_0 m_ge_0 by (simp add:power2_eq_square) finally show ?thesis by simp next case False have "prob {\. Q v \ < t} \ prob {\. False}" proof (rule pmf_mono[OF M_def]) fix \ assume a:"\ \ {\. Q v \ < t}" assume "\ \ set_pmf (pmf_of_set space)" hence b:"\x. x < p \ hash x \ < p" using hash_range mod_ring_carr by (simp add:M_def measure_pmf_inverse) have "t \ card (set as)" using True by simp also have "... \ Q v \" unfolding Q_def using b False as_lt_p by (intro card_mono subsetI, simp, force) also have "... < t" using a by simp finally have "False" by auto thus "\ \ {\. False}" by simp qed also have "... = 0" by auto finally show ?thesis by simp qed have "prob {\. \has_no_collision \} \ prob {\. \x \ set as. \y \ set as. x \ y \ tr_hash x \ \ real_of_int v \ tr_hash x \ = tr_hash y \}" by (rule pmf_mono[OF M_def]) (simp add:has_no_collision_def M_def, force) also have "... \ (5/2) * (real (card (set as)))\<^sup>2 * (real_of_int v)\<^sup>2 * 2 powr - real r / (real p)\<^sup>2 + 1 / real p" using collision_prob v_ge_1 by blast also have "... \ (5/2) * (real m)\<^sup>2 * (real_of_int v)\<^sup>2 * 2 powr - real r / (real p)\<^sup>2 + 1 / real p" by (intro divide_right_mono add_mono mult_right_mono mult_mono power_mono, simp_all add:m_def) also have "... \ (5/2) * (real m)\<^sup>2 * (4 * real t * real p / real m)\<^sup>2 * (2 powr - real r) / (real p)\<^sup>2 + 1 / real p" using v_def v_ge_1 v_ubound by (intro add_mono divide_right_mono mult_right_mono mult_left_mono, auto) also have "... = 40 * (real t)\<^sup>2 * (2 powr -real r) + 1 / real p" using p_gt_0 m_ge_0 t_gt_0 by (simp add:algebra_simps power2_eq_square) also have "... \ 1/18 + 1/18" using t_r_bound p_ge_18 by (intro add_mono, simp_all add: pos_le_divide_eq) also have "... = 1/9" by simp finally have case_3: "prob {\. \has_no_collision \} \ 1/9" by simp have "prob {\. real_of_rat \ * of_rat (F 0 as) < \estimate' (sketch_rv' \) - of_rat (F 0 as)\} \ prob {\. Q u \ \ t \ Q v \ < t \ \(has_no_collision \)}" proof (rule pmf_mono[OF M_def], rule ccontr) fix \ assume "\ \ set_pmf (pmf_of_set space)" assume "\ \ {\. real_of_rat \ * real_of_rat (F 0 as) < \estimate' (sketch_rv' \) - real_of_rat (F 0 as)\}" hence est: "real_of_rat \ * real_of_rat (F 0 as) < \estimate' (sketch_rv' \) - real_of_rat (F 0 as)\" by simp assume "\ \ {\. t \ Q u \ \ Q v \ < t \ \ has_no_collision \}" hence "\( t \ Q u \ \ Q v \ < t \ \ has_no_collision \)" by simp hence lb: "Q u \ < t" and ub: "Q v \ \ t" and no_col: "has_no_collision \" by simp+ define y where "y = nth_mset (t-1) {#int (hash x \). x \# mset_set (set as)#}" define y' where "y' = nth_mset (t-1) {#tr_hash x \. x \# mset_set (set as)#}" have rank_t_lb: "u \ y" unfolding y_def using True t_gt_0 lb by (intro nth_mset_bound_left, simp_all add:count_less_def swap_filter_image Q_def) have rank_t_ub: "y \ v - 1" unfolding y_def using True t_gt_0 ub by (intro nth_mset_bound_right, simp_all add:Q_def swap_filter_image count_le_def) have y_ge_0: "real_of_int y \ 0" using rank_t_lb a_ge_0 by linarith have "mono (\x. truncate_down r (real_of_int x))" by (metis truncate_down_mono mono_def of_int_le_iff) hence y'_eq: "y' = truncate_down r y" unfolding y_def y'_def using True t_gt_0 by (subst nth_mset_commute_mono[where f="(\x. truncate_down r (of_int x))"]) (simp_all add: multiset.map_comp comp_def tr_hash_def) have "real_of_int u * (1 - 2 powr -real r) \ real_of_int y * (1 - 2 powr (-real r))" using rank_t_lb of_int_le_iff two_pow_r_le_1 by (intro mult_right_mono, auto) also have "... \ y'" using y'_eq truncate_down_pos[OF y_ge_0] by simp finally have rank_t_lb': "u * (1 - 2 powr -real r) \ y'" by simp have "y' \ real_of_int y" by (subst y'_eq, rule truncate_down_le, simp) also have "... \ real_of_int (v-1)" using rank_t_ub of_int_le_iff by blast finally have rank_t_ub': "y' \ v-1" by simp have "0 < u * (1-2 powr -real r)" using a_ge_1 two_pow_r_le_1 by (intro mult_pos_pos, auto) hence y'_pos: "y' > 0" using rank_t_lb' by linarith have no_col': "\x. x \ y' \ count {#tr_hash x \. x \# mset_set (set as)#} x \ 1" using rank_t_ub' no_col by (simp add:vimage_def card_le_Suc0_iff_eq count_image_mset has_no_collision_def) force have h_1: "Max (sketch_rv' \) = y'" using True t_gt_0 no_col' by (simp add:sketch_rv'_def y'_def nth_mset_max) have "card (sketch_rv' \) = card (least ((t-1)+1) (set_mset {#tr_hash x \. x \# mset_set (set as)#}))" using t_gt_0 by (simp add:sketch_rv'_def) also have "... = (t-1) +1" using True t_gt_0 no_col' by (intro nth_mset_max(2), simp_all add:y'_def) also have "... = t" using t_gt_0 by simp finally have "card (sketch_rv' \) = t" by simp hence h_3: "estimate' (sketch_rv' \) = real t * real p / y'" using h_1 by (simp add:estimate'_def) have "(real t) * real p \ (1 + \') * real m * ((real t) * real p / (real m * (1 + \')))" using \'_lt_1 m_def True t_gt_0 \'_gt_0 by auto also have "... \ (1+\') * m * u" using \'_gt_0 by (intro mult_left_mono, simp_all add:u_def) also have "... < ((1 + real_of_rat \)*(1-real_of_rat \/8)) * m * u" using True m_def t_gt_0 a_ge_1 \_range by (intro mult_strict_right_mono, auto simp add:\'_def right_diff_distrib) also have "... \ ((1 + real_of_rat \)*(1-2 powr (-r))) * m * u" using r_le_\ \_range a_ge_0 by (intro mult_right_mono mult_left_mono, auto) also have "... = (1 + real_of_rat \) * m * (u * (1-2 powr -real r))" by simp also have "... \ (1 + real_of_rat \) * m * y'" using \_range by (intro mult_left_mono rank_t_lb', simp) finally have "real t * real p < (1 + real_of_rat \) * m * y'" by simp hence f_1: "estimate' (sketch_rv' \) < (1 + real_of_rat \) * m" using y'_pos by (simp add: h_3 pos_divide_less_eq) have "(1 - real_of_rat \) * m * y' \ (1 - real_of_rat \) * m * v" using \_range rank_t_ub' y'_pos by (intro mult_mono rank_t_ub', simp_all) also have "... = (1-real_of_rat \) * (real m * v)" by simp also have "... < (1-\') * (real m * v)" using \_range m_ge_0 v_ge_1 by (intro mult_strict_right_mono mult_pos_pos, simp_all add:\'_def) also have "... \ (1-\') * (real m * (real t * real p / (real m * (1-\'))))" using \'_gt_0 \'_lt_1 by (intro mult_left_mono, auto simp add:v_def) also have "... = real t * real p" using \'_gt_0 \'_lt_1 t_gt_0 p_gt_0 m_ge_0 by auto finally have "(1 - real_of_rat \) * m * y' < real t * real p" by simp hence f_2: "estimate' (sketch_rv' \) > (1 - real_of_rat \) * m" using y'_pos by (simp add: h_3 pos_less_divide_eq) have "abs (estimate' (sketch_rv' \) - real_of_rat (F 0 as)) < real_of_rat \ * (real_of_rat (F 0 as))" using f_1 f_2 by (simp add:abs_less_iff algebra_simps m_eq_F_0) thus "False" using est by linarith qed also have "... \ 1/9 + (1/9 + 1/9)" by (intro pmf_add_2[OF M_def] case_1 case_2 case_3) also have "... = 1/3" by simp finally show ?thesis by simp next case False have "prob {\. real_of_rat \ * of_rat (F 0 as) < \estimate' (sketch_rv' \) - of_rat (F 0 as)\} \ prob {\. \x \ set as. \y \ set as. x \ y \ tr_hash x \ \ real p \ tr_hash x \ = tr_hash y \}" proof (rule pmf_mono[OF M_def]) fix \ assume a:"\ \ {\. real_of_rat \ * real_of_rat (F 0 as) < \estimate' (sketch_rv' \) - real_of_rat (F 0 as)\}" assume b:"\ \ set_pmf (pmf_of_set space)" have c: "card (set as) < t" using False by auto hence "card ((\x. tr_hash x \) ` set as) < t" using card_image_le order_le_less_trans by blast hence d:"card (sketch_rv' \) = card ((\x. tr_hash x \) ` (set as))" by (simp add:sketch_rv'_def card_least) have "card (sketch_rv' \) < t" by (metis List.finite_set c d card_image_le order_le_less_trans) hence "estimate' (sketch_rv' \) = card (sketch_rv' \)" by (simp add:estimate'_def) hence "card (sketch_rv' \) \ real_of_rat (F 0 as)" using a \_range by simp (metis abs_zero cancel_comm_monoid_add_class.diff_cancel of_nat_less_0_iff pos_prod_lt zero_less_of_rat_iff) hence "card (sketch_rv' \) \ card (set as)" using m_def m_eq_F_0 by linarith hence "\inj_on (\x. tr_hash x \) (set as)" using card_image d by auto moreover have "tr_hash x \ \ real p" if a:"x \ set as" for x proof - have "hash x \ < p" using hash_range as_lt_p a b by (simp add:mod_ring_carr M_def) thus "tr_hash x \ \ real p" using truncate_down_le by (simp add:tr_hash_def) qed ultimately show "\ \ {\. \x \ set as. \y \ set as. x \ y \ tr_hash x \ \ real p \ tr_hash x \ = tr_hash y \}" by (simp add:inj_on_def, blast) qed also have "... \ (5/2) * (real (card (set as)))\<^sup>2 * (real p)\<^sup>2 * 2 powr - real r / (real p)\<^sup>2 + 1 / real p" using p_gt_0 by (intro collision_prob, auto) also have "... = (5/2) * (real (card (set as)))\<^sup>2 * 2 powr (- real r) + 1 / real p" using p_gt_0 by (simp add:ac_simps power2_eq_square) also have "... \ (5/2) * (real t)\<^sup>2 * 2 powr (-real r) + 1 / real p" using False by (intro add_mono mult_right_mono mult_left_mono power_mono, auto) also have "... \ 1/6 + 1/6" using t_r_bound p_ge_18 by (intro add_mono, simp_all) also have "... \ 1/3" by simp finally show ?thesis by simp qed private lemma median_bounds: "\

(\ in measure_pmf \\<^sub>0. \median s (\i. estimate (sketch_rv (\ i))) - F 0 as\ \ \ * F 0 as) \ 1 - real_of_rat \" proof - have "strict_mono_on A real_of_float" for A by (meson less_float.rep_eq strict_mono_onI) hence real_g_2: "\\. sketch_rv' \ = real_of_float ` sketch_rv \" by (simp add: sketch_rv'_def sketch_rv_def tr_hash_def least_mono_commute image_comp) moreover have "inj_on real_of_float A" for A using real_of_float_inject by (simp add:inj_on_def) ultimately have card_eq: "\\. card (sketch_rv \) = card (sketch_rv' \)" using real_g_2 by (auto intro!: card_image[symmetric]) have "Max (sketch_rv' \) = real_of_float (Max (sketch_rv \))" if a:"card (sketch_rv' \) \ t" for \ proof - have "mono real_of_float" using less_eq_float.rep_eq mono_def by blast moreover have "finite (sketch_rv \)" by (simp add:sketch_rv_def least_def) moreover have " sketch_rv \ \ {}" using card_eq[symmetric] card_gt_0_iff t_gt_0 a by (simp, force) ultimately show ?thesis by (subst mono_Max_commute[where f="real_of_float"], simp_all add:real_g_2) qed hence real_g: "\\. estimate' (sketch_rv' \) = real_of_rat (estimate (sketch_rv \))" by (simp add:estimate_def estimate'_def card_eq of_rat_divide of_rat_mult of_rat_add real_of_rat_of_float) have indep: "prob_space.indep_vars (measure_pmf \\<^sub>0) (\_. borel) (\i \. estimate' (sketch_rv' (\ i))) {0..\<^sub>0_def by (rule indep_vars_restrict_intro', auto simp add:restrict_dfl_def lessThan_atLeast0) moreover have "- (18 * ln (real_of_rat \)) \ real s" using of_nat_ceiling by (simp add:s_def) blast moreover have "i < s \ measure \\<^sub>0 {\. of_rat \ * of_rat (F 0 as) < \estimate' (sketch_rv' (\ i)) - of_rat (F 0 as)\} \ 1/3" for i using estimate'_bounds unfolding \\<^sub>0_def M_def by (subst prob_prod_pmf_slice, simp_all) ultimately have "1-real_of_rat \ \ \

(\ in measure_pmf \\<^sub>0. \median s (\i. estimate' (sketch_rv' (\ i))) - real_of_rat (F 0 as)\ \ real_of_rat \ * real_of_rat (F 0 as))" using \_range prob_space_measure_pmf by (intro prob_space.median_bound_2) auto also have "... = \

(\ in measure_pmf \\<^sub>0. \median s (\i. estimate (sketch_rv (\ i))) - F 0 as\ \ \ * F 0 as)" using s_gt_0 median_rat[symmetric] real_g by (intro arg_cong2[where f="measure"]) (simp_all add:of_rat_diff[symmetric] of_rat_mult[symmetric] of_rat_less_eq) finally show "\

(\ in measure_pmf \\<^sub>0. \median s (\i. estimate (sketch_rv (\ i))) - F 0 as\ \ \ * F 0 as) \ 1-real_of_rat \" by blast qed lemma f0_alg_correct': "\

(\ in measure_pmf result. \\ - F 0 as\ \ \ * F 0 as) \ 1 - of_rat \" proof - have f0_result_elim: "\x. f0_result (s, t, p, r, x, \i\{..i. estimate (sketch_rv (x i))))" by (simp add:estimate_def, rule median_cong, simp) have "result = map_pmf (\x. (s, t, p, r, x, \i\{..\<^sub>0 \ f0_result" by (subst result_def, subst f0_alg_sketch, simp) also have "... = \\<^sub>0 \ (\x. return_pmf (s, t, p, r, x, \i\{.. f0_result" by (simp add:t_def p_def r_def s_def map_pmf_def) also have "... = \\<^sub>0 \ (\x. return_pmf (median s (\i. estimate (sketch_rv (x i)))))" by (subst bind_assoc_pmf, subst bind_return_pmf, subst f0_result_elim) simp finally have a:"result = \\<^sub>0 \ (\x. return_pmf (median s (\i. estimate (sketch_rv (x i)))))" by simp show ?thesis using median_bounds by (simp add: a map_pmf_def[symmetric]) qed private lemma f_subset: assumes "g ` A \ h ` B" shows "(\x. f (g x)) ` A \ (\x. f (h x)) ` B" using assms by auto lemma f0_exact_space_usage': defines "\ \ fold (\a state. state \ f0_update a) as (f0_init \ \ n)" shows "AE \ in \. bit_count (encode_f0_state \) \ f0_space_usage (n, \, \)" proof - have log_2_4: "log 2 4 = 2" by (metis log2_of_power_eq mult_2 numeral_Bit0 of_nat_numeral power2_eq_square) have a: "bit_count (F\<^sub>e (float_of (truncate_down r y))) \ ereal (12 + 4 * real r + 2 * log 2 (log 2 (n+13)))" if a_1:"y \ {.. 1") case True have aux_1: " 0 < 2 + log 2 (real y)" using True by (intro add_pos_nonneg, auto) have aux_2: "0 < 2 + log 2 (real p)" using p_gt_1 by (intro add_pos_nonneg, auto) have "bit_count (F\<^sub>e (float_of (truncate_down r y))) \ ereal (10 + 4 * real r + 2 * log 2 (2 + \log 2 \real y\\))" by (rule truncate_float_bit_count) also have "... = ereal (10 + 4 * real r + 2 * log 2 (2 + (log 2 (real y))))" using True by simp also have "... \ ereal (10 + 4 * real r + 2 * log 2 (2 + log 2 p))" using aux_1 aux_2 True p_gt_0 a_1 by simp also have "... \ ereal (10 + 4 * real r + 2 * log 2 (log 2 4 + log 2 (2 * n + 40)))" using log_2_4 p_le_n p_gt_0 by (intro ereal_mono add_mono mult_left_mono log_mono of_nat_mono add_pos_nonneg, auto) also have "... = ereal (10 + 4 * real r + 2 * log 2 (log 2 (8 * n + 160)))" by (simp add:log_mult[symmetric]) also have "... \ ereal (10 + 4 * real r + 2 * log 2 (log 2 ((n+13) powr 2)))" by (intro ereal_mono add_mono mult_left_mono log_mono of_nat_mono add_pos_nonneg) (auto simp add:power2_eq_square algebra_simps) also have "... = ereal (10 + 4 * real r + 2 * log 2 (log 2 4 * log 2 (n + 13)))" by (subst log_powr, simp_all add:log_2_4) also have "... = ereal (12 + 4 * real r + 2 * log 2 (log 2 (n + 13)))" by (subst log_mult, simp_all add:log_2_4) finally show ?thesis by simp next case False hence "y = 0" using a_1 by simp then show ?thesis by (simp add:float_bit_count_zero) qed have "bit_count (encode_f0_state (s, t, p, r, x, \i\{.. f0_space_usage (n, \, \)" if b: "x \ {..\<^sub>E space" for x proof - have c: "x \ extensional {.. (\k. float_of (truncate_down r k)) ` {.. (\xa. float_of (truncate_down r (hash xa (x y)))) ` set as" using least_subset by (auto simp add:sketch_rv_def tr_hash_def) also have "... \ (\k. float_of (truncate_down r (real k))) ` {..y. y < s \ finite (sketch_rv (x y))" unfolding sketch_rv_def by (rule finite_subset[OF least_subset], simp) moreover have card_sketch: "\y. y < s \ card (sketch_rv (x y)) \ t " by (simp add:sketch_rv_def card_least) moreover have "\y z. y < s \ z \ sketch_rv (x y) \ bit_count (F\<^sub>e z) \ ereal (12 + 4 * real r + 2 * log 2 (log 2 (real n + 13)))" using a d by auto ultimately have e: "\y. y < s \ bit_count (S\<^sub>e F\<^sub>e (sketch_rv (x y))) \ ereal (real t) * (ereal (12 + 4 * real r + 2 * log 2 (log 2 (real (n + 13)))) + 1) + 1" using float_encoding by (intro set_bit_count_est, auto) have f: "\y. y < s \ bit_count (P\<^sub>e p 2 (x y)) \ ereal (real 2 * (log 2 (real p) + 1))" using p_gt_1 b by (intro bounded_degree_polynomial_bit_count) (simp_all add:space_def PiE_def Pi_def) have "bit_count (encode_f0_state (s, t, p, r, x, \i\{..e s) + bit_count (N\<^sub>e t) + bit_count (N\<^sub>e p) + bit_count (N\<^sub>e r) + bit_count (([0..\<^sub>e P\<^sub>e p 2) x) + bit_count (([0..\<^sub>e S\<^sub>e F\<^sub>e) (\i\{.. ereal (2* log 2 (real s + 1) + 1) + ereal (2* log 2 (real t + 1) + 1) + ereal (2* log 2 (real p + 1) + 1) + ereal (2 * log 2 (real r + 1) + 1) + (ereal (real s) * (ereal (real 2 * (log 2 (real p) + 1)))) + (ereal (real s) * ((ereal (real t) * (ereal (12 + 4 * real r + 2 * log 2 (log 2 (real (n + 13)))) + 1) + 1)))" using c e f by (intro add_mono exp_golomb_bit_count fun_bit_count_est[where xs="[0.. ereal ( 4 + 2 * log 2 (real s + 1) + 2 * log 2 (real t + 1) + 2 * log 2 (2 * (21 + real n)) + 2 * log 2 (real r + 1) + real s * (3 + 2 * log 2 (2 * (21 + real n)) + real t * (13 + (4 * real r + 2 * log 2 (log 2 (real n + 13))))))" using p_le_n p_gt_0 by (intro ereal_mono add_mono mult_left_mono, auto) also have "... = ereal (6 + 2 * log 2 (real s + 1) + 2 * log 2 (real t + 1) + 2 * log 2 (21 + real n) + 2 * log 2 (real r + 1) + real s * (5 + 2 * log 2 (21 + real n) + real t * (13 + (4 * real r + 2 * log 2 (log 2 (real n + 13))))))" by (subst (1 2) log_mult, auto) also have "... \ f0_space_usage (n, \, \)" by (simp add:s_def[symmetric] r_def[symmetric] t_def[symmetric] Let_def) (simp add:algebra_simps) finally show "bit_count (encode_f0_state (s, t, p, r, x, \i\{.. f0_space_usage (n, \, \)" by simp qed hence "\x. x \ set_pmf \\<^sub>0 \ bit_count (encode_f0_state (s, t, p, r, x, \i\{.. ereal (f0_space_usage (n, \, \))" by (simp add:\\<^sub>0_def set_prod_pmf del:f0_space_usage.simps) hence "\y. y \ set_pmf \ \ bit_count (encode_f0_state y) \ ereal (f0_space_usage (n, \, \))" by (simp add: \_def f0_alg_sketch del:f0_space_usage.simps f0_init.simps) (metis (no_types, lifting) image_iff pmf.set_map) thus ?thesis by (simp add: AE_measure_pmf_iff del:f0_space_usage.simps) qed end text \Main results of this section:\ theorem f0_alg_correct: assumes "\ \ {0<..<1}" assumes "\ \ {0<..<1}" assumes "set as \ {.. \ fold (\a state. state \ f0_update a) as (f0_init \ \ n) \ f0_result" shows "\

(\ in measure_pmf \. \\ - F 0 as\ \ \ * F 0 as) \ 1 - of_rat \" using f0_alg_correct'[OF assms(1-3)] unfolding \_def by blast theorem f0_exact_space_usage: assumes "\ \ {0<..<1}" assumes "\ \ {0<..<1}" assumes "set as \ {.. \ fold (\a state. state \ f0_update a) as (f0_init \ \ n)" shows "AE \ in \. bit_count (encode_f0_state \) \ f0_space_usage (n, \, \)" using f0_exact_space_usage'[OF assms(1-3)] unfolding \_def by blast theorem f0_asymptotic_space_complexity: "f0_space_usage \ O[at_top \\<^sub>F at_right 0 \\<^sub>F at_right 0](\(n, \, \). ln (1 / of_rat \) * (ln (real n) + 1 / (of_rat \)\<^sup>2 * (ln (ln (real n)) + ln (1 / of_rat \))))" (is "_ \ O[?F](?rhs)") proof - define n_of :: "nat \ rat \ rat \ nat" where "n_of = (\(n, \, \). n)" define \_of :: "nat \ rat \ rat \ rat" where "\_of = (\(n, \, \). \)" define \_of :: "nat \ rat \ rat \ rat" where "\_of = (\(n, \, \). \)" define t_of where "t_of = (\x. nat \80 / (real_of_rat (\_of x))\<^sup>2\)" define s_of where "s_of = (\x. nat \-(18 * ln (real_of_rat (\_of x)))\)" define r_of where "r_of = (\x. nat (4 * \log 2 (1 / real_of_rat (\_of x))\ + 23))" define g where "g = (\x. ln (1 / of_rat (\_of x)) * (ln (real (n_of x)) + 1 / (of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / of_rat (\_of x)))))" have evt: "(\x. 0 < real_of_rat (\_of x) \ 0 < real_of_rat (\_of x) \ 1/real_of_rat (\_of x) \ \ \ 1/real_of_rat (\_of x) \ \ \ real (n_of x) \ n \ P x) \ eventually P ?F" (is "(\x. ?prem x \ _) \ _") for \ \ n P apply (rule eventually_mono[where P="?prem" and Q="P"]) apply (simp add:\_of_def case_prod_beta' \_of_def n_of_def) apply (intro eventually_conj eventually_prod1' eventually_prod2' sequentially_inf eventually_at_right_less inv_at_right_0_inf) by (auto simp add:prod_filter_eq_bot) have exp_pos: "exp k \ real x \ x > 0" for k x using exp_gt_zero gr0I by force have exp_gt_1: "exp 1 \ (1::real)" by simp have 1: "(\_. 1) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (auto intro!:landau_o.big_mono evt[where \="exp 1"] iffD2[OF ln_ge_iff] simp add:abs_ge_iff) have 2: "(\_. 1) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (auto intro!:landau_o.big_mono evt[where \="exp 1"] iffD2[OF ln_ge_iff] simp add:abs_ge_iff) have 3: " (\x. 1) \ O[?F](\x. ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x)))" using exp_pos by (intro landau_sum_2 2 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff], auto) have 4: "(\_. 1) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" using one_le_power by (intro landau_o.big_mono evt[where \="1"], auto simp add:power_one_over[symmetric]) have "(\x. 80 * (1 / (real_of_rat (\_of x))\<^sup>2)) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" by (subst landau_o.big.cmult_in_iff, auto) hence 5: "(\x. real (t_of x)) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" unfolding t_of_def by (intro landau_real_nat landau_ceil 4, auto) have "(\x. ln (real_of_rat (\_of x))) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (intro landau_o.big_mono evt[where \="1"], auto simp add:ln_div) hence 6: "(\x. real (s_of x)) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" unfolding s_of_def by (intro landau_nat_ceil 1, simp) have 7: " (\x. 1) \ O[?F](\x. ln (real (n_of x)))" using exp_pos by (auto intro!: landau_o.big_mono evt[where n="exp 1"] iffD2[OF ln_ge_iff] simp: abs_ge_iff) have 8:" (\_. 1) \ O[?F](\x. ln (real (n_of x)) + 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" using order_trans[OF exp_gt_1] exp_pos by (intro landau_sum_1 7 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff] mult_nonneg_nonneg add_nonneg_nonneg) auto have "(\x. ln (real (s_of x) + 1)) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (intro landau_ln_3 sum_in_bigo 6 1, simp) hence 9: "(\x. log 2 (real (s_of x) + 1)) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1 8, auto simp:log_def) have 10: "(\x. 1) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1 8 1) have "(\x. ln (real (t_of x) + 1)) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" using 5 by (intro landau_o.big_mult_1 3 landau_ln_3 sum_in_bigo 4, simp_all) hence " (\x. log 2 (real (t_of x) + 1)) \ O[?F](\x. ln (real (n_of x)) + 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" using order_trans[OF exp_gt_1] exp_pos by (intro landau_sum_2 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff] mult_nonneg_nonneg add_nonneg_nonneg) (auto simp add:log_def) hence 11: "(\x. log 2 (real (t_of x) + 1)) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1' 1, auto) have " (\x. 1) \ O[?F](\x. real (n_of x))" by (intro landau_o.big_mono evt[where n="1"], auto) hence "(\x. ln (real (n_of x) + 21)) \ O[?F](\x. ln (real (n_of x)))" by (intro landau_ln_2[where a="2"] evt[where n="2"] sum_in_bigo, auto) hence 12: "(\x. log 2 (real (n_of x) + 21)) \ O[?F](g)" unfolding g_def using exp_pos order_trans[OF exp_gt_1] by (intro landau_o.big_mult_1' 1 landau_sum_1 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff] mult_nonneg_nonneg add_nonneg_nonneg) (auto simp add:log_def) have "(\x. ln (1 / real_of_rat (\_of x))) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" by (intro landau_ln_3 evt[where \="1"] landau_o.big_mono) (auto simp add:power_one_over[symmetric] self_le_power) hence " (\x. real (nat (4*\log 2 (1 / real_of_rat (\_of x))\+23))) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" using 4 by (auto intro!: landau_real_nat sum_in_bigo landau_ceil simp:log_def) hence " (\x. ln (real (r_of x) + 1)) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" unfolding r_of_def by (intro landau_ln_3 sum_in_bigo 4, auto) hence " (\x. log 2 (real (r_of x) + 1)) \ O[?F](\x. (1 / (real_of_rat (\_of x))\<^sup>2) * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" by (intro landau_o.big_mult_1 3, simp add:log_def) hence " (\x. log 2 (real (r_of x) + 1)) \ O[?F](\x. ln (real (n_of x)) + 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" using exp_pos order_trans[OF exp_gt_1] by (intro landau_sum_2 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff] add_nonneg_nonneg mult_nonneg_nonneg) (auto) hence 13: "(\x. log 2 (real (r_of x) + 1)) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1' 1, auto) have 14: "(\x. 1) \ O[?F](\x. real (n_of x))" by (intro landau_o.big_mono evt[where n="1"], auto) have "(\x. ln (real (n_of x) + 13)) \ O[?F](\x. ln (real (n_of x)))" using 14 by (intro landau_ln_2[where a="2"] evt[where n="2"] sum_in_bigo, auto) hence "(\x. ln (log 2 (real (n_of x) + 13))) \ O[?F](\x. ln (ln (real (n_of x))))" using exp_pos by (intro landau_ln_2[where a="2"] iffD2[OF ln_ge_iff] evt[where n="exp 2"]) (auto simp add:log_def) hence "(\x. log 2 (log 2 (real (n_of x) + 13))) \ O[?F](\x. ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x)))" using exp_pos by (intro landau_sum_1 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff]) (auto simp add:log_def) moreover have "(\x. real (r_of x)) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" unfolding r_of_def using 2 by (auto intro!: landau_real_nat sum_in_bigo landau_ceil simp:log_def) hence "(\x. real (r_of x)) \ O[?F](\x. ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x)))" using exp_pos by (intro landau_sum_2 evt[where n="exp 1" and \="1"] ln_ge_zero iffD2[OF ln_ge_iff], auto) ultimately have 15:" (\x. real (t_of x) * (13 + 4 * real (r_of x) + 2 * log 2 (log 2 (real (n_of x) + 13)))) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" using 5 3 by (intro landau_o.mult sum_in_bigo, auto) have "(\x. 5 + 2 * log 2 (21 + real (n_of x)) + real (t_of x) * (13 + 4 * real (r_of x) + 2 * log 2 (log 2 (real (n_of x) + 13)))) \ O[?F](\x. ln (real (n_of x)) + 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x))))" proof - have "\\<^sub>F x in ?F. 0 \ ln (real (n_of x))" by (intro evt[where n="1"] ln_ge_zero, auto) moreover have "\\<^sub>F x in ?F. 0 \ 1 / (real_of_rat (\_of x))\<^sup>2 * (ln (ln (real (n_of x))) + ln (1 / real_of_rat (\_of x)))" using exp_pos by (intro evt[where n="exp 1" and \="1"] mult_nonneg_nonneg add_nonneg_nonneg ln_ge_zero iffD2[OF ln_ge_iff]) auto moreover have " (\x. ln (21 + real (n_of x))) \ O[?F](\x. ln (real (n_of x)))" using 14 by (intro landau_ln_2[where a="2"] sum_in_bigo evt[where n="2"], auto) hence "(\x. 5 + 2 * log 2 (21 + real (n_of x))) \ O[?F](\x. ln (real (n_of x)))" using 7 by (intro sum_in_bigo, auto simp add:log_def) ultimately show ?thesis using 15 by (rule landau_sum) qed hence 16: "(\x. real (s_of x) * (5 + 2 * log 2 (21 + real (n_of x)) + real (t_of x) * (13 + 4 * real (r_of x) + 2 * log 2 (log 2 (real (n_of x) + 13))))) \ O[?F](g)" unfolding g_def by (intro landau_o.mult 6, auto) have "f0_space_usage = (\x. f0_space_usage (n_of x, \_of x, \_of x))" by (simp add:case_prod_beta' n_of_def \_of_def \_of_def) also have "... \ O[?F](g)" using 9 10 11 12 13 16 by (simp add:fun_cong[OF s_of_def[symmetric]] fun_cong[OF t_of_def[symmetric]] fun_cong[OF r_of_def[symmetric]] Let_def) (intro sum_in_bigo, auto) also have "... = O[?F](?rhs)" by (simp add:case_prod_beta' g_def n_of_def \_of_def \_of_def) finally show ?thesis by simp qed end diff --git a/thys/Frequency_Moments/Frequency_Moment_2.thy b/thys/Frequency_Moments/Frequency_Moment_2.thy --- a/thys/Frequency_Moments/Frequency_Moment_2.thy +++ b/thys/Frequency_Moments/Frequency_Moment_2.thy @@ -1,721 +1,721 @@ section \Frequency Moment $2$\ theory Frequency_Moment_2 imports Universal_Hash_Families.Carter_Wegman_Hash_Family Equivalence_Relation_Enumeration.Equivalence_Relation_Enumeration Landau_Ext Median_Method.Median Product_PMF_Ext - Universal_Hash_Families.Field + Universal_Hash_Families.Universal_Hash_Families_More_Finite_Fields Frequency_Moments begin text \This section contains a formalization of the algorithm for the second frequency moment. It is based on the algorithm described in \<^cite>\\\textsection 2.2\ in "alon1999"\. The only difference is that the algorithm is adapted to work with prime field of odd order, which greatly reduces the implementation complexity.\ fun f2_hash where "f2_hash p h k = (if even (ring.hash (mod_ring p) k h) then int p - 1 else - int p - 1)" type_synonym f2_state = "nat \ nat \ nat \ (nat \ nat \ nat list) \ (nat \ nat \ int)" fun f2_init :: "rat \ rat \ nat \ f2_state pmf" where "f2_init \ \ n = do { let s\<^sub>1 = nat \6 / \\<^sup>2\; let s\<^sub>2 = nat \-(18 * ln (real_of_rat \))\; let p = prime_above (max n 3); h \ prod_pmf ({..1} \ {..2}) (\_. pmf_of_set (bounded_degree_polynomials (mod_ring p) 4)); return_pmf (s\<^sub>1, s\<^sub>2, p, h, (\_ \ {..1} \ {..2}. (0 :: int))) }" fun f2_update :: "nat \ f2_state \ f2_state pmf" where "f2_update x (s\<^sub>1, s\<^sub>2, p, h, sketch) = return_pmf (s\<^sub>1, s\<^sub>2, p, h, \i \ {..1} \ {..2}. f2_hash p (h i) x + sketch i)" fun f2_result :: "f2_state \ rat pmf" where "f2_result (s\<^sub>1, s\<^sub>2, p, h, sketch) = return_pmf (median s\<^sub>2 (\i\<^sub>2 \ {..2}. (\i\<^sub>1\{..1} . (rat_of_int (sketch (i\<^sub>1, i\<^sub>2)))\<^sup>2) / (((rat_of_nat p)\<^sup>2-1) * rat_of_nat s\<^sub>1)))" fun f2_space_usage :: "(nat \ nat \ rat \ rat) \ real" where "f2_space_usage (n, m, \, \) = ( let s\<^sub>1 = nat \6 / \\<^sup>2 \ in let s\<^sub>2 = nat \-(18 * ln (real_of_rat \))\ in 3 + 2 * log 2 (s\<^sub>1 + 1) + 2 * log 2 (s\<^sub>2 + 1) + 2 * log 2 (9 + 2 * real n) + s\<^sub>1 * s\<^sub>2 * (5 + 4*log 2 (8 + 2 * real n) + 2 * log 2 (real m * (18 + 4 * real n) + 1 )))" definition encode_f2_state :: "f2_state \ bool list option" where "encode_f2_state = N\<^sub>e \\<^sub>e (\s\<^sub>1. N\<^sub>e \\<^sub>e (\s\<^sub>2. N\<^sub>e \\<^sub>e (\p. (List.product [0..1] [0..2] \\<^sub>e P\<^sub>e p 4) \\<^sub>e (List.product [0..1] [0..2] \\<^sub>e I\<^sub>e))))" lemma "inj_on encode_f2_state (dom encode_f2_state)" proof - have " is_encoding encode_f2_state" unfolding encode_f2_state_def by (intro dependent_encoding exp_golomb_encoding fun_encoding list_encoding int_encoding poly_encoding) thus ?thesis by (rule encoding_imp_inj) qed context fixes \ \ :: rat fixes n :: nat fixes as :: "nat list" fixes result assumes \_range: "\ \ {0<..<1}" assumes \_range: "\ > 0" assumes as_range: "set as \ {.. fold (\a state. state \ f2_update a) as (f2_init \ \ n) \ f2_result" begin private definition s\<^sub>1 where "s\<^sub>1 = nat \6 / \\<^sup>2\" lemma s1_gt_0: "s\<^sub>1 > 0" using \_range by (simp add:s\<^sub>1_def) private definition s\<^sub>2 where "s\<^sub>2 = nat \-(18* ln (real_of_rat \))\" lemma s2_gt_0: "s\<^sub>2 > 0" using \_range by (simp add:s\<^sub>2_def) private definition p where "p = prime_above (max n 3)" lemma p_prime: "Factorial_Ring.prime p" unfolding p_def using prime_above_prime by blast lemma p_ge_3: "p \ 3" unfolding p_def by (meson max.boundedE prime_above_lower_bound) lemma p_gt_0: "p > 0" using p_ge_3 by linarith lemma p_gt_1: "p > 1" using p_ge_3 by simp lemma p_ge_n: "p \ n" unfolding p_def by (meson max.boundedE prime_above_lower_bound ) interpretation carter_wegman_hash_family "mod_ring p" 4 using carter_wegman_hash_familyI[OF mod_ring_is_field mod_ring_finite] using p_prime by auto definition sketch where "sketch = fold (\a state. state \ f2_update a) as (f2_init \ \ n)" private definition \ where"\ = prod_pmf ({..1} \ {..2}) (\_. pmf_of_set space)" private definition \\<^sub>p where"\\<^sub>p = measure_pmf \" private definition sketch_rv where "sketch_rv \ = of_int (sum_list (map (f2_hash p \) as))^2" private definition mean_rv where "mean_rv \ = (\i\<^sub>2. (\i\<^sub>1 = 0..1. sketch_rv (\ (i\<^sub>1, i\<^sub>2))) / (((of_nat p)\<^sup>2 - 1) * of_nat s\<^sub>1))" private definition result_rv where "result_rv \ = median s\<^sub>2 (\i\<^sub>2\{..2}. mean_rv \ i\<^sub>2)" lemma mean_rv_alg_sketch: "sketch = \ \ (\\. return_pmf (s\<^sub>1, s\<^sub>2, p, \, \i \ {..1} \ {..2}. sum_list (map (f2_hash p (\ i)) as)))" proof - have "sketch = fold (\a state. state \ f2_update a) as (f2_init \ \ n)" by (simp add:sketch_def) also have "... = \ \ (\\. return_pmf (s\<^sub>1, s\<^sub>2, p, \, \i \ {..1} \ {..2}. sum_list (map (f2_hash p (\ i)) as)))" proof (induction as rule:rev_induct) case Nil then show ?case by (simp add:s\<^sub>1_def s\<^sub>2_def space_def p_def[symmetric] \_def restrict_def Let_def) next case (snoc a as) have "fold (\a state. state \ f2_update a) (as @ [a]) (f2_init \ \ n) = \ \ (\\. return_pmf (s\<^sub>1, s\<^sub>2, p, \, \s \ {..1} \ {..2}. (\x \ as. f2_hash p (\ s) x)) \ f2_update a)" using snoc by (simp add: bind_assoc_pmf restrict_def del:f2_hash.simps f2_init.simps) also have "... = \ \ (\\. return_pmf (s\<^sub>1, s\<^sub>2, p, \, \i \ {..1} \ {..2}. (\x \ as@[a]. f2_hash p (\ i) x)))" by (subst bind_return_pmf) (simp add: add.commute del:f2_hash.simps cong:restrict_cong) finally show ?case by blast qed finally show ?thesis by auto qed lemma distr: "result = map_pmf result_rv \" proof - have "result = sketch \ f2_result" by (simp add:result_def sketch_def) also have "... = \ \ (\x. f2_result (s\<^sub>1, s\<^sub>2, p, x, \i\{..1} \ {..2}. sum_list (map (f2_hash p (x i)) as)))" by (simp add: mean_rv_alg_sketch bind_assoc_pmf bind_return_pmf) also have "... = map_pmf result_rv \" by (simp add:map_pmf_def result_rv_def mean_rv_def sketch_rv_def lessThan_atLeast0 cong:restrict_cong) finally show ?thesis by simp qed private lemma f2_hash_pow_exp: assumes "k < p" shows "expectation (\\. real_of_int (f2_hash p \ k) ^m) = ((real p - 1) ^ m * (real p + 1) + (- real p - 1) ^ m * (real p - 1)) / (2 * real p)" proof - have "odd p" using p_prime p_ge_3 prime_odd_nat assms by simp then obtain t where t_def: "p=2*t+1" using oddE by blast have "Collect even \ {..<2 * t + 1} \ (*) 2 ` {.. Collect even \ {..<2 * t + 1}" by (rule image_subsetI, simp) ultimately have "card ({k. even k} \ {..x. 2*x) ` {.. {.. {... hash k \ \ Collect even} = (real p + 1)/(2*real p)" using assms by (subst prob_range, auto simp:frac_eq_eq p_gt_0 mod_ring_def) have "p = card {.. {.. ({k. even k} \ {.. {.. {.. {.. {.. {... hash k \ \ Collect odd} = (real p - 1)/(2*real p)" using assms by (subst prob_range, auto simp add: frac_eq_eq mod_ring_def) have "expectation (\x. real_of_int (f2_hash p x k) ^ m) = expectation (\\. indicator {\. even (hash k \)} \ * (real p - 1)^m + indicator {\. odd (hash k \)} \ * (-real p - 1)^m)" by (rule Bochner_Integration.integral_cong, simp, simp) also have "... = prob {\. hash k \ \ Collect even} * (real p - 1) ^ m + prob {\. hash k \ \ Collect odd} * (-real p - 1) ^ m " by (simp, simp add:M_def) also have "... = (real p + 1) * (real p - 1) ^ m / (2 * real p) + (real p - 1) * (- real p - 1) ^ m / (2 * real p)" by (subst prob_even, subst prob_odd, simp) also have "... = ((real p - 1) ^ m * (real p + 1) + (- real p - 1) ^ m * (real p - 1)) / (2 * real p)" by (simp add:add_divide_distrib ac_simps) finally show "expectation (\x. real_of_int (f2_hash p x k) ^ m) = ((real p - 1) ^ m * (real p + 1) + (- real p - 1) ^ m * (real p - 1)) / (2 * real p)" by simp qed lemma shows var_sketch_rv:"variance sketch_rv \ 2*(real_of_rat (F 2 as)^2) * ((real p)\<^sup>2-1)\<^sup>2" (is "?A") and exp_sketch_rv:"expectation sketch_rv = real_of_rat (F 2 as) * ((real p)\<^sup>2-1)" (is "?B") proof - define h where "h = (\\ x. real_of_int (f2_hash p \ x))" define c where "c = (\x. real (count_list as x))" define r where "r = (\(m::nat). ((real p - 1) ^ m * (real p + 1) + (- real p - 1) ^ m * (real p - 1)) / (2 * real p))" define h_prod where "h_prod = (\as \. prod_list (map (h \) as))" define exp_h_prod :: "nat list \ real" where "exp_h_prod = (\as. (\i \ set as. r (count_list as i)))" have f_eq: "sketch_rv = (\\. (\x \ set as. c x * h \ x)^2)" by (rule ext, simp add:sketch_rv_def c_def h_def sum_list_eval del:f2_hash.simps) have r_one: "r (Suc 0) = 0" by (simp add:r_def algebra_simps) have r_two: "r 2 = (real p^2-1)" using p_gt_0 unfolding r_def power2_eq_square by (simp add:nonzero_divide_eq_eq, simp add:algebra_simps) have"(real p)^2 \ 2^2" by (rule power_mono, use p_gt_1 in linarith, simp) hence p_square_ge_4: "(real p)\<^sup>2 \ 4" by simp have "r 4 = (real p)^4+2*(real p)\<^sup>2 - 3" using p_gt_0 unfolding r_def by (subst nonzero_divide_eq_eq, auto simp:power4_eq_xxxx power2_eq_square algebra_simps) also have "... \ (real p)^4+2*(real p)\<^sup>2 + 3" by simp also have "... \ 3 * r 2 * r 2" using p_square_ge_4 by (simp add:r_two power4_eq_xxxx power2_eq_square algebra_simps mult_left_mono) finally have r_four_est: "r 4 \ 3 * r 2 * r 2" by simp have exp_h_prod_elim: "exp_h_prod = (\as. prod_list (map (r \ count_list as) (remdups as)))" by (simp add:exp_h_prod_def prod.set_conv_list[symmetric]) have exp_h_prod: "\x. set x \ set as \ length x \ 4 \ expectation (h_prod x) = exp_h_prod x" proof - fix x assume "set x \ set as" hence x_sub_p: "set x \ {..k. k \ set x \ k < p" by auto assume "length x \ 4" hence card_x: "card (set x) \ 4" using card_length dual_order.trans by blast have "set x \ carrier (mod_ring p) " using x_sub_p by (simp add:mod_ring_def) hence h_indep: "indep_vars (\_. borel) (\i \. h \ i ^ count_list x i) (set x)" using k_wise_indep_vars_subset[OF k_wise_indep] card_x as_range h_def by (auto intro:indep_vars_compose2[where X="hash" and M'=" (\_. discrete)"]) have "expectation (h_prod x) = expectation (\\. \ i \ set x. h \ i^(count_list x i))" by (simp add:h_prod_def prod_list_eval) also have "... = (\i \ set x. expectation (\\. h \ i^(count_list x i)))" by (simp add: indep_vars_lebesgue_integral[OF _ h_indep]) also have "... = (\i \ set x. r (count_list x i))" using f2_hash_pow_exp x_le_p by (simp add:h_def r_def M_def[symmetric] del:f2_hash.simps) also have "... = exp_h_prod x" by (simp add:exp_h_prod_def) finally show "expectation (h_prod x) = exp_h_prod x" by simp qed have "\x y. kernel_of x = kernel_of y \ exp_h_prod x = exp_h_prod y" proof - fix x y :: "nat list" assume a:"kernel_of x = kernel_of y" then obtain f where b:"bij_betw f (set x) (set y)" and c:"\z. z \ set x \ count_list x z = count_list y (f z)" using kernel_of_eq_imp_bij by blast have "exp_h_prod x = prod ( (\i. r(count_list y i)) \ f) (set x)" by (simp add:exp_h_prod_def c) also have "... = (\i \ f ` (set x). r(count_list y i))" by (metis b bij_betw_def prod.reindex) also have "... = exp_h_prod y" unfolding exp_h_prod_def by (rule prod.cong, metis b bij_betw_def) simp finally show "exp_h_prod x = exp_h_prod y" by simp qed hence exp_h_prod_cong: "\p x. of_bool (kernel_of x = kernel_of p) * exp_h_prod p = of_bool (kernel_of x = kernel_of p) * exp_h_prod x" by (metis (full_types) of_bool_eq_0_iff vector_space_over_itself.scale_zero_left) have c:"(\p\enum_rgfs n. of_bool (kernel_of xs = kernel_of p) * r) = r" if a:"length xs = n" for xs :: "nat list" and n and r :: real proof - have "(\p\enum_rgfs n. of_bool (kernel_of xs = kernel_of p) * 1) = (1::real)" using equiv_rels_2[OF a[symmetric]] by (simp add:equiv_rels_def comp_def) thus "(\p\enum_rgfs n. of_bool (kernel_of xs = kernel_of p) * r) = (r::real)" by (simp add:sum_list_mult_const) qed have "expectation sketch_rv = (\i\set as. (\j\set as. c i * c j * expectation (h_prod [i,j])))" by (simp add:f_eq h_prod_def power2_eq_square sum_distrib_left sum_distrib_right Bochner_Integration.integral_sum algebra_simps) also have "... = (\i\set as. (\j\set as. c i * c j * exp_h_prod [i,j]))" by (simp add:exp_h_prod) also have "... = (\i \ set as. (\j \ set as. c i * c j * (sum_list (map (\p. of_bool (kernel_of [i,j] = kernel_of p) * exp_h_prod p) (enum_rgfs 2)))))" by (subst exp_h_prod_cong, simp add:c) also have "... = (\i\set as. c i * c i * r 2)" by (simp add: numeral_eq_Suc kernel_of_eq All_less_Suc exp_h_prod_elim r_one distrib_left sum.distrib sum_collapse) also have "... = real_of_rat (F 2 as) * ((real p)^2-1)" by (simp add: sum_distrib_right[symmetric] c_def F_def power2_eq_square of_rat_sum of_rat_mult r_two) finally show b:?B by simp have "expectation (\x. (sketch_rv x)\<^sup>2) = (\i1 \ set as. (\i2 \ set as. (\i3 \ set as. (\i4 \ set as. c i1 * c i2 * c i3 * c i4 * expectation (h_prod [i1, i2, i3, i4])))))" by (simp add:f_eq h_prod_def power4_eq_xxxx sum_distrib_left sum_distrib_right Bochner_Integration.integral_sum algebra_simps) also have "... = (\i1 \ set as. (\i2 \ set as. (\i3 \ set as. (\i4 \ set as. c i1 * c i2 * c i3 * c i4 * exp_h_prod [i1,i2,i3,i4]))))" by (simp add:exp_h_prod) also have "... = (\i1 \ set as. (\i2 \ set as. (\i3 \ set as. (\i4 \ set as. c i1 * c i2 * c i3 * c i4 * (sum_list (map (\p. of_bool (kernel_of [i1,i2,i3,i4] = kernel_of p) * exp_h_prod p) (enum_rgfs 4)))))))" by (subst exp_h_prod_cong, simp add:c) also have "... = 3 * (\i \ set as. (\j \ set as. c i^2 * c j^2 * r 2 * r 2)) + ((\ i \ set as. c i^4 * r 4) - 3 * (\ i \ set as. c i ^ 4 * r 2 * r 2))" apply (simp add: numeral_eq_Suc exp_h_prod_elim r_one) (* large intermediate terms *) apply (simp add: kernel_of_eq All_less_Suc numeral_eq_Suc distrib_left sum.distrib sum_collapse neq_commute of_bool_not_iff) apply (simp add: algebra_simps sum_subtractf sum_collapse) apply (simp add: sum_distrib_left algebra_simps) done also have "... = 3 * (\i \ set as. c i^2 * r 2)^2 + (\ i \ set as. c i ^ 4 * (r 4 - 3 * r 2 * r 2))" by (simp add:power2_eq_square sum_distrib_left algebra_simps sum_subtractf) also have "... = 3 * (\i \ set as. c i^2)^2 * (r 2)^2 + (\i \ set as. c i ^ 4 * (r 4 - 3 * r 2 * r 2))" by (simp add:power_mult_distrib sum_distrib_right[symmetric]) also have "... \ 3 * (\i \ set as. c i^2)^2 * (r 2)^2 + (\i \ set as. c i ^ 4 * 0)" using r_four_est by (auto intro!: sum_nonpos simp add:mult_nonneg_nonpos) also have "... = 3 * (real_of_rat (F 2 as)^2) * ((real p)\<^sup>2-1)\<^sup>2" by (simp add:c_def r_two F_def of_rat_sum of_rat_power) finally have "expectation (\x. (sketch_rv x)\<^sup>2) \ 3 * (real_of_rat (F 2 as)^2) * ((real p)\<^sup>2-1)\<^sup>2" by simp thus "variance sketch_rv \ 2*(real_of_rat (F 2 as)^2) * ((real p)\<^sup>2-1)\<^sup>2" by (simp add: variance_eq, simp add:power_mult_distrib b) qed lemma space_omega_1 [simp]: "Sigma_Algebra.space \\<^sub>p = UNIV" by (simp add:\\<^sub>p_def) interpretation \: prob_space "\\<^sub>p" by (simp add:\\<^sub>p_def prob_space_measure_pmf) lemma integrable_\: fixes f :: "((nat \ nat) \ (nat list)) \ real" shows "integrable \\<^sub>p f" unfolding \\<^sub>p_def \_def by (rule integrable_measure_pmf_finite, auto intro:finite_PiE simp:set_prod_pmf) lemma sketch_rv_exp: assumes "i\<^sub>2 < s\<^sub>2" assumes "i\<^sub>1 \ {0..1}" shows "\.expectation (\\. sketch_rv (\ (i\<^sub>1, i\<^sub>2))) = real_of_rat (F 2 as) * ((real p)\<^sup>2 - 1)" proof - have "\.expectation (\\. (sketch_rv (\ (i\<^sub>1, i\<^sub>2))) :: real) = expectation sketch_rv" using integrable_\ integrable_M assms unfolding \_def \\<^sub>p_def M_def by (subst expectation_Pi_pmf_slice, auto) also have "... = (real_of_rat (F 2 as)) * ((real p)\<^sup>2 - 1)" using exp_sketch_rv by simp finally show ?thesis by simp qed lemma sketch_rv_var: assumes "i\<^sub>2 < s\<^sub>2" assumes "i\<^sub>1 \ {0..1}" shows "\.variance (\\. sketch_rv (\ (i\<^sub>1, i\<^sub>2))) \ 2 * (real_of_rat (F 2 as))\<^sup>2 * ((real p)\<^sup>2 - 1)\<^sup>2" proof - have "\.variance (\\. (sketch_rv (\ (i\<^sub>1, i\<^sub>2)) :: real)) = variance sketch_rv" using integrable_\ integrable_M assms unfolding \_def \\<^sub>p_def M_def by (subst variance_prod_pmf_slice, auto) also have "... \ 2 * (real_of_rat (F 2 as))\<^sup>2 * ((real p)\<^sup>2 - 1)\<^sup>2" using var_sketch_rv by simp finally show ?thesis by simp qed lemma mean_rv_exp: assumes "i < s\<^sub>2" shows "\.expectation (\\. mean_rv \ i) = real_of_rat (F 2 as)" proof - have a:"(real p)\<^sup>2 > 1" using p_gt_1 by simp have "\.expectation (\\. mean_rv \ i) = (\i\<^sub>1 = 0..1. \.expectation (\\. sketch_rv (\ (i\<^sub>1, i)))) / (((real p)\<^sup>2 - 1) * real s\<^sub>1)" using assms integrable_\ by (simp add:mean_rv_def) also have "... = (\i\<^sub>1 = 0..1. real_of_rat (F 2 as) * ((real p)\<^sup>2 - 1)) / (((real p)\<^sup>2 - 1) * real s\<^sub>1)" using sketch_rv_exp[OF assms] by simp also have "... = real_of_rat (F 2 as)" using s1_gt_0 a by simp finally show ?thesis by simp qed lemma mean_rv_var: assumes "i < s\<^sub>2" shows "\.variance (\\. mean_rv \ i) \ (real_of_rat (\ * F 2 as))\<^sup>2 / 3" proof - have a: "\.indep_vars (\_. borel) (\i\<^sub>1 x. sketch_rv (x (i\<^sub>1, i))) {0..1}" using assms unfolding \\<^sub>p_def \_def by (intro indep_vars_restrict_intro'[where f="fst"]) (auto simp add: restrict_dfl_def case_prod_beta lessThan_atLeast0) have p_sq_ne_1: "(real p)^2 \ 1" by (metis p_gt_1 less_numeral_extra(4) of_nat_power one_less_power pos2 semiring_char_0_class.of_nat_eq_1_iff) have s1_bound: " 6 / (real_of_rat \)\<^sup>2 \ real s\<^sub>1" unfolding s\<^sub>1_def by (metis (mono_tags, opaque_lifting) of_rat_ceiling of_rat_divide of_rat_numeral_eq of_rat_power real_nat_ceiling_ge) have "\.variance (\\. mean_rv \ i) = \.variance (\\. \i\<^sub>1 = 0..1. sketch_rv (\ (i\<^sub>1, i))) / (((real p)\<^sup>2 - 1) * real s\<^sub>1)\<^sup>2" unfolding mean_rv_def by (subst \.variance_divide[OF integrable_\], simp) also have "... = (\i\<^sub>1 = 0..1. \.variance (\\. sketch_rv (\ (i\<^sub>1, i)))) / (((real p)\<^sup>2 - 1) * real s\<^sub>1)\<^sup>2" by (subst \.var_sum_all_indep[OF _ _ integrable_\ a]) (auto simp: \_def \\<^sub>p_def) also have "... \ (\i\<^sub>1 = 0..1. 2*(real_of_rat (F 2 as)^2) * ((real p)\<^sup>2-1)\<^sup>2) / (((real p)\<^sup>2 - 1) * real s\<^sub>1)\<^sup>2" by (rule divide_right_mono, rule sum_mono[OF sketch_rv_var[OF assms]], auto) also have "... = 2 * (real_of_rat (F 2 as)^2) / real s\<^sub>1" using p_sq_ne_1 s1_gt_0 by (subst frac_eq_eq, auto simp:power2_eq_square) also have "... \ 2 * (real_of_rat (F 2 as)^2) / (6 / (real_of_rat \)\<^sup>2)" using s1_gt_0 \_range by (intro divide_left_mono mult_pos_pos s1_bound) auto also have "... = (real_of_rat (\ * F 2 as))\<^sup>2 / 3" by (simp add:of_rat_mult algebra_simps) finally show ?thesis by simp qed lemma mean_rv_bounds: assumes "i < s\<^sub>2" shows "\.prob {\. real_of_rat \ * real_of_rat (F 2 as) < \mean_rv \ i - real_of_rat (F 2 as)\} \ 1/3" proof (cases "as = []") case True then show ?thesis using assms by (subst mean_rv_def, subst sketch_rv_def, simp add:F_def) next case False hence "F 2 as > 0" using F_gr_0 by auto hence a: "0 < real_of_rat (\ * F 2 as)" using \_range by simp have [simp]: "(\\. mean_rv \ i) \ borel_measurable \\<^sub>p" by (simp add:\_def \\<^sub>p_def) have "\.prob {\. real_of_rat \ * real_of_rat (F 2 as) < \mean_rv \ i - real_of_rat (F 2 as)\} \ \.prob {\. real_of_rat (\ * F 2 as) \ \mean_rv \ i - real_of_rat (F 2 as)\}" by (rule \.pmf_mono[OF \\<^sub>p_def], simp add:of_rat_mult) also have "... \ \.variance (\\. mean_rv \ i) / (real_of_rat (\ * F 2 as))\<^sup>2" using \.Chebyshev_inequality[where a="real_of_rat (\ * F 2 as)" and f="\\. mean_rv \ i",simplified] a prob_space_measure_pmf[where p="\"] mean_rv_exp[OF assms] integrable_\ by simp also have "... \ ((real_of_rat (\ * F 2 as))\<^sup>2/3) / (real_of_rat (\ * F 2 as))\<^sup>2" by (rule divide_right_mono, rule mean_rv_var[OF assms], simp) also have "... = 1/3" using a by force finally show ?thesis by blast qed lemma f2_alg_correct': "\

(\ in measure_pmf result. \\ - F 2 as\ \ \ * F 2 as) \ 1-of_rat \" proof - have a: "\.indep_vars (\_. borel) (\i \. mean_rv \ i) {0..2}" using s1_gt_0 unfolding \\<^sub>p_def \_def by (intro indep_vars_restrict_intro'[where f="snd"]) (auto simp: \\<^sub>p_def \_def mean_rv_def restrict_dfl_def) have b: "- 18 * ln (real_of_rat \) \ real s\<^sub>2" unfolding s\<^sub>2_def using of_nat_ceiling by auto have "1 - of_rat \ \ \.prob {\. \median s\<^sub>2 (mean_rv \) - real_of_rat (F 2 as) \ \ of_rat \ * of_rat (F 2 as)}" using \_range \.median_bound_2[OF _ a b, where \="real_of_rat \ * real_of_rat (F 2 as)" and \="real_of_rat (F 2 as)"] mean_rv_bounds by simp also have "... = \.prob {\. \real_of_rat (result_rv \) - of_rat (F 2 as) \ \ of_rat \ * of_rat (F 2 as)}" by (simp add:result_rv_def median_restrict lessThan_atLeast0 median_rat[OF s2_gt_0] mean_rv_def sketch_rv_def of_rat_divide of_rat_sum of_rat_mult of_rat_diff of_rat_power) also have "... = \.prob {\. \result_rv \ - F 2 as\ \ \ * F 2 as} " by (simp add:of_rat_less_eq of_rat_mult[symmetric] of_rat_diff[symmetric] set_eq_iff) finally have "\.prob {y. \result_rv y - F 2 as\ \ \ * F 2 as} \ 1-of_rat \ " by simp thus ?thesis by (simp add: distr \\<^sub>p_def) qed lemma f2_exact_space_usage': "AE \ in sketch . bit_count (encode_f2_state \) \ f2_space_usage (n, length as, \, \)" proof - have "p \ 2 * max n 3 + 2" by (subst p_def, rule prime_above_upper_bound) also have "... \ 2 * n + 8" by (cases "n \ 2", simp_all) finally have p_bound: "p \ 2 * n + 8" by simp have "bit_count (N\<^sub>e p) \ ereal (2 * log 2 (real p + 1) + 1)" by (rule exp_golomb_bit_count) also have "... \ ereal (2 * log 2 (2 * real n + 9) + 1)" using p_bound by simp finally have p_bit_count: "bit_count (N\<^sub>e p) \ ereal (2 * log 2 (2 * real n + 9) + 1)" by simp have a: "bit_count (encode_f2_state (s\<^sub>1, s\<^sub>2, p, y, \i\{..1} \ {..2}. sum_list (map (f2_hash p (y i)) as))) \ ereal (f2_space_usage (n, length as, \, \))" if a:"y\{..1} \ {..2} \\<^sub>E bounded_degree_polynomials (mod_ring p) 4" for y proof - have "y \ extensional ({..1} \ {..2})" using a PiE_iff by blast hence y_ext: "y \ extensional (set (List.product [0..1] [0..2]))" by (simp add:lessThan_atLeast0) have h_bit_count_aux: "bit_count (P\<^sub>e p 4 (y x)) \ ereal (4 + 4 * log 2 (8 + 2 * real n))" if b:"x \ set (List.product [0..1] [0..2])" for x proof - have "y x \ bounded_degree_polynomials (Field.mod_ring p) 4" using b a by force hence "bit_count (P\<^sub>e p 4 (y x)) \ ereal ( real 4 * (log 2 (real p) + 1))" by (rule bounded_degree_polynomial_bit_count[OF p_gt_1] ) also have "... \ ereal (real 4 * (log 2 (8 + 2 * real n) + 1) )" using p_gt_0 p_bound by simp also have "... \ ereal (4 + 4 * log 2 (8 + 2 * real n))" by simp finally show ?thesis by blast qed have h_bit_count: "bit_count ((List.product [0..1] [0..2] \\<^sub>e P\<^sub>e p 4) y) \ ereal (real s\<^sub>1 * real s\<^sub>2 * (4 + 4 * log 2 (8 + 2 * real n)))" using fun_bit_count_est[where e="P\<^sub>e p 4", OF y_ext h_bit_count_aux] by simp have sketch_bit_count_aux: "bit_count (I\<^sub>e (sum_list (map (f2_hash p (y x)) as))) \ ereal (1 + 2 * log 2 (real (length as) * (18 + 4 * real n) + 1))" (is "?lhs \ ?rhs") if " x \ {0..1} \ {0..2}" for x proof - have "\sum_list (map (f2_hash p (y x)) as)\ \ sum_list (map (abs \ (f2_hash p (y x))) as)" by (subst map_map[symmetric]) (rule sum_list_abs) also have "... \ sum_list (map (\_. (int p+1)) as)" by (rule sum_list_mono) (simp add:p_gt_0) also have "... = int (length as) * (int p+1)" by (simp add: sum_list_triv) also have "... \ int (length as) * (9+2*(int n))" using p_bound by (intro mult_mono, auto) finally have "\sum_list (map (f2_hash p (y x)) as)\ \ int (length as) * (9 + 2 * int n)" by simp hence "?lhs \ ereal (2 * log 2 (real_of_int (2* (int (length as) * (9 + 2 * int n)) + 1)) + 1)" by (rule int_bit_count_est) also have "... = ?rhs" by (simp add:algebra_simps) finally show "?thesis" by simp qed have "bit_count ((List.product [0..1] [0..2] \\<^sub>e I\<^sub>e) (\i\{..1} \ {..2}. sum_list (map (f2_hash p (y i)) as))) \ ereal (real (length (List.product [0..1] [0..2]))) * (ereal (1 + 2 * log 2 (real (length as) * (18 + 4 * real n) + 1)))" by (intro fun_bit_count_est) (simp_all add:extensional_def lessThan_atLeast0 sketch_bit_count_aux del:f2_hash.simps) also have "... = ereal (real s\<^sub>1 * real s\<^sub>2 * (1 + 2 * log 2 (real (length as) * (18 + 4 * real n) + 1)))" by simp finally have sketch_bit_count: "bit_count ((List.product [0..1] [0..2] \\<^sub>e I\<^sub>e) (\i\{..1} \ {..2}. sum_list (map (f2_hash p (y i)) as))) \ ereal (real s\<^sub>1 * real s\<^sub>2 * (1 + 2 * log 2 (real (length as) * (18 + 4 * real n) + 1)))" by simp have "bit_count (encode_f2_state (s\<^sub>1, s\<^sub>2, p, y, \i\{..1} \ {..2}. sum_list (map (f2_hash p (y i)) as))) \ bit_count (N\<^sub>e s\<^sub>1) + bit_count (N\<^sub>e s\<^sub>2) +bit_count (N\<^sub>e p) + bit_count ((List.product [0..1] [0..2] \\<^sub>e P\<^sub>e p 4) y) + bit_count ((List.product [0..1] [0..2] \\<^sub>e I\<^sub>e) (\i\{..1} \ {..2}. sum_list (map (f2_hash p (y i)) as)))" by (simp add:Let_def s\<^sub>1_def s\<^sub>2_def encode_f2_state_def dependent_bit_count add.assoc) also have "... \ ereal (2 * log 2 (real s\<^sub>1 + 1) + 1) + ereal (2 * log 2 (real s\<^sub>2 + 1) + 1) + ereal (2 * log 2 (2 * real n + 9) + 1) + (ereal (real s\<^sub>1 * real s\<^sub>2) * (4 + 4 * log 2 (8 + 2 * real n))) + (ereal (real s\<^sub>1 * real s\<^sub>2) * (1 + 2 * log 2 (real (length as) * (18 + 4 * real n) + 1) ))" by (intro add_mono exp_golomb_bit_count p_bit_count, auto intro: h_bit_count sketch_bit_count) also have "... = ereal (f2_space_usage (n, length as, \, \))" by (simp add:distrib_left add.commute s\<^sub>1_def[symmetric] s\<^sub>2_def[symmetric] Let_def) finally show "bit_count (encode_f2_state (s\<^sub>1, s\<^sub>2, p, y, \i\{..1} \ {..2}. sum_list (map (f2_hash p (y i)) as))) \ ereal (f2_space_usage (n, length as, \, \))" by simp qed have "set_pmf \ = {..1} \ {..2} \\<^sub>E bounded_degree_polynomials (Field.mod_ring p) 4" by (simp add: \_def set_prod_pmf) (simp add: space_def) thus ?thesis by (simp add:mean_rv_alg_sketch AE_measure_pmf_iff del:f2_space_usage.simps, metis a) qed end text \Main results of this section:\ theorem f2_alg_correct: assumes "\ \ {0<..<1}" assumes "\ > 0" assumes "set as \ {.. \ fold (\a state. state \ f2_update a) as (f2_init \ \ n) \ f2_result" shows "\

(\ in measure_pmf \. \\ - F 2 as\ \ \ * F 2 as) \ 1-of_rat \" using f2_alg_correct'[OF assms(1,2,3)] \_def by auto theorem f2_exact_space_usage: assumes "\ \ {0<..<1}" assumes "\ > 0" assumes "set as \ {.. fold (\a state. state \ f2_update a) as (f2_init \ \ n)" shows "AE \ in M. bit_count (encode_f2_state \) \ f2_space_usage (n, length as, \, \)" using f2_exact_space_usage'[OF assms(1,2,3)] by (subst (asm) sketch_def[OF assms(1,2,3)], subst M_def, simp) theorem f2_asymptotic_space_complexity: "f2_space_usage \ O[at_top \\<^sub>F at_top \\<^sub>F at_right 0 \\<^sub>F at_right 0](\ (n, m, \, \). (ln (1 / of_rat \)) / (of_rat \)\<^sup>2 * (ln (real n) + ln (real m)))" (is "_ \ O[?F](?rhs)") proof - define n_of :: "nat \ nat \ rat \ rat \ nat" where "n_of = (\(n, m, \, \). n)" define m_of :: "nat \ nat \ rat \ rat \ nat" where "m_of = (\(n, m, \, \). m)" define \_of :: "nat \ nat \ rat \ rat \ rat" where "\_of = (\(n, m, \, \). \)" define \_of :: "nat \ nat \ rat \ rat \ rat" where "\_of = (\(n, m, \, \). \)" define g where "g = (\x. (1/ (of_rat (\_of x))\<^sup>2) * (ln (1 / of_rat (\_of x))) * (ln (real (n_of x)) + ln (real (m_of x))))" have evt: "(\x. 0 < real_of_rat (\_of x) \ 0 < real_of_rat (\_of x) \ 1/real_of_rat (\_of x) \ \ \ 1/real_of_rat (\_of x) \ \ \ real (n_of x) \ n \ real (m_of x) \ m\ P x) \ eventually P ?F" (is "(\x. ?prem x \ _) \ _") for \ \ n m P apply (rule eventually_mono[where P="?prem" and Q="P"]) apply (simp add:\_of_def case_prod_beta' \_of_def n_of_def m_of_def) apply (intro eventually_conj eventually_prod1' eventually_prod2' sequentially_inf eventually_at_right_less inv_at_right_0_inf) by (auto simp add:prod_filter_eq_bot) have unit_1: "(\_. 1) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" using one_le_power by (intro landau_o.big_mono evt[where \="1"], auto simp add:power_one_over[symmetric]) have unit_2: "(\_. 1) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (intro landau_o.big_mono evt[where \="exp 1"]) (auto intro!:iffD2[OF ln_ge_iff] simp add:abs_ge_iff) have unit_3: "(\_. 1) \ O[?F](\x. real (n_of x))" by (intro landau_o.big_mono evt, auto) have unit_4: "(\_. 1) \ O[?F](\x. real (m_of x))" by (intro landau_o.big_mono evt, auto) have unit_5: "(\_. 1) \ O[?F](\x. ln (real (n_of x)))" by (auto intro!: landau_o.big_mono evt[where n="exp 1"]) (metis abs_ge_self linorder_not_le ln_ge_iff not_exp_le_zero order.trans) have unit_6: "(\_. 1) \ O[?F](\x. ln (real (n_of x)) + ln (real (m_of x)))" by (intro landau_sum_1 evt unit_5 iffD2[OF ln_ge_iff], auto) have unit_7: "(\_. 1) \ O[?F](\x. 1 / real_of_rat (\_of x))" by (intro landau_o.big_mono evt[where \="1"], auto) have unit_8: "(\_. 1) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1 unit_1 unit_2 unit_6) have unit_9: "(\_. 1) \ O[?F](\x. real (n_of x) * real (m_of x))" by (intro landau_o.big_mult_1 unit_3 unit_4) have " (\x. 6 * (1 / (real_of_rat (\_of x))\<^sup>2)) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" by (subst landau_o.big.cmult_in_iff, simp_all) hence l1: "(\x. real (nat \6 / (\_of x)\<^sup>2\)) \ O[?F](\x. 1 / (real_of_rat (\_of x))\<^sup>2)" by (intro landau_real_nat landau_rat_ceil[OF unit_1]) (simp_all add:of_rat_divide of_rat_power) have "(\x. - ( ln (real_of_rat (\_of x)))) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (intro landau_o.big_mono evt) (subst ln_div, auto) hence l2: "(\x. real (nat \- (18 * ln (real_of_rat (\_of x)))\)) \ O[?F](\x. ln (1 / real_of_rat (\_of x)))" by (intro landau_real_nat landau_ceil[OF unit_2], simp) have l3_aux: " (\x. real (m_of x) * (18 + 4 * real (n_of x)) + 1) \ O[?F](\x. real (n_of x) * real (m_of x))" by (rule sum_in_bigo[OF _unit_9], subst mult.commute) (intro landau_o.mult sum_in_bigo, auto simp:unit_3) have "(\x. ln (real (m_of x) * (18 + 4 * real (n_of x)) + 1)) \ O[?F](\x. ln (real (n_of x) * real (m_of x)))" apply (rule landau_ln_2[where a="2"], simp, simp) apply (rule evt[where m="2" and n="1"]) apply (metis dual_order.trans mult_left_mono mult_of_nat_commute of_nat_0_le_iff verit_prod_simplify(1)) using l3_aux by simp also have "(\x. ln (real (n_of x) * real (m_of x))) \ O[?F](\x. ln (real (n_of x)) + ln(real (m_of x)))" by (intro landau_o.big_mono evt[where m="1" and n="1"], auto simp add:ln_mult) finally have l3: "(\x. ln (real (m_of x) * (18 + 4 * real (n_of x)) + 1)) \ O[?F](\x. ln (real (n_of x)) + ln (real (m_of x)))" using landau_o.big_trans by simp have l4: "(\x. ln (8 + 2 * real (n_of x))) \ O[?F](\x. ln (real (n_of x)) + ln (real (m_of x)))" by (intro landau_sum_1 evt[where n="2"] landau_ln_2[where a="2"] iffD2[OF ln_ge_iff]) (auto intro!: sum_in_bigo simp add:unit_3) have l5: "(\x. ln (9 + 2 * real (n_of x))) \ O[?F](\x. ln (real (n_of x)) + ln (real (m_of x)))" by (intro landau_sum_1 evt[where n="2"] landau_ln_2[where a="2"] iffD2[OF ln_ge_iff]) (auto intro!: sum_in_bigo simp add:unit_3) have l6: "(\x. ln (real (nat \6 / (\_of x)\<^sup>2\) + 1)) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1 landau_ln_3 sum_in_bigo unit_6 unit_2 l1 unit_1, simp) have l7: "(\x. ln (9 + 2 * real (n_of x))) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1' unit_1 unit_2 l5) have l8: "(\x. ln (real (nat \- (18 * ln (real_of_rat (\_of x)))\) + 1) ) \ O[?F](g)" unfolding g_def by (intro landau_o.big_mult_1 unit_6 landau_o.big_mult_1' unit_1 landau_ln_3 sum_in_bigo l2 unit_2) simp have l9: "(\x. 5 + 4 * ln (8 + 2 * real (n_of x)) / ln 2 + 2 * ln (real (m_of x) * (18 + 4 * real (n_of x)) + 1) / ln 2) \ O[?F](\x. ln (real (n_of x)) + ln (real (m_of x)))" by (intro sum_in_bigo, auto simp: l3 l4 unit_6) have l10: "(\x. real (nat \6 / (\_of x)\<^sup>2\) * real (nat \- (18 * ln (real_of_rat (\_of x)))\) * (5 + 4 * ln (8 + 2 * real (n_of x)) / ln 2 + 2 * ln(real (m_of x) * (18 + 4 * real (n_of x)) + 1) / ln 2)) \ O[?F](g)" unfolding g_def by (intro landau_o.mult, auto simp: l1 l2 l9) have "f2_space_usage = (\x. f2_space_usage (n_of x, m_of x, \_of x, \_of x))" by (simp add:case_prod_beta' n_of_def \_of_def \_of_def m_of_def) also have "... \ O[?F](g)" by (auto intro!:sum_in_bigo simp:Let_def log_def l6 l7 l8 l10 unit_8) also have "... = O[?F](?rhs)" by (simp add:case_prod_beta' g_def n_of_def \_of_def \_of_def m_of_def) finally show ?thesis by simp qed end diff --git a/thys/Frequency_Moments/Frequency_Moments.thy b/thys/Frequency_Moments/Frequency_Moments.thy --- a/thys/Frequency_Moments/Frequency_Moments.thy +++ b/thys/Frequency_Moments/Frequency_Moments.thy @@ -1,115 +1,115 @@ section "Frequency Moments" theory Frequency_Moments imports Frequency_Moments_Preliminary_Results - Universal_Hash_Families.Field + Universal_Hash_Families.Universal_Hash_Families_More_Finite_Fields Interpolation_Polynomials_HOL_Algebra.Interpolation_Polynomial_Cardinalities begin text \This section contains a definition of the frequency moments of a stream and a few general results about frequency moments..\ definition F where "F k xs = (\ x \ set xs. (rat_of_nat (count_list xs x)^k))" lemma F_ge_0: "F k as \ 0" unfolding F_def by (rule sum_nonneg, simp) lemma F_gr_0: assumes "as \ []" shows "F k as > 0" proof - have "rat_of_nat 1 \ rat_of_nat (card (set as))" using assms card_0_eq[where A="set as"] by (intro of_nat_mono) (metis List.finite_set One_nat_def Suc_leI neq0_conv set_empty) also have "... = (\x\set as. 1)" by simp also have "... \ (\x\set as. rat_of_nat (count_list as x) ^ k)" by (intro sum_mono one_le_power) (metis count_list_gr_1 of_nat_1 of_nat_le_iff) also have "... \ F k as" by (simp add:F_def) finally show ?thesis by simp qed definition P\<^sub>e :: "nat \ nat \ nat list \ bool list option" where - "P\<^sub>e p n f = (if p > 1 \ f \ bounded_degree_polynomials (Field.mod_ring p) n then - ([0..\<^sub>e Nb\<^sub>e p) (\i \ {..e p n f = (if p > 1 \ f \ bounded_degree_polynomials (mod_ring p) n then + ([0..\<^sub>e Nb\<^sub>e p) (\i \ {..e p n)" proof (cases "p > 1") case True - interpret cring "Field.mod_ring p" + interpret cring "mod_ring p" using mod_ring_is_cring True by blast have a:"inj_on (\x. (\i \ {.. bounded_degree_polynomials (mod_ring p) n" assume c:"y \ bounded_degree_polynomials (mod_ring p) n" assume d:"restrict (coeff x) {.. n" by linarith have "coeff x i = \\<^bsub>mod_ring p\<^esub>" using b e by (subst coeff_length, auto simp:bounded_degree_polynomials_length) also have "... = coeff y i" using c e by (subst coeff_length, auto simp:bounded_degree_polynomials_length) finally show ?thesis by simp qed then show "x = y" using b c univ_poly_carrier by (subst coeff_iff_polynomial_cond) (auto simp:bounded_degree_polynomials_length) qed have "is_encoding (\f. P\<^sub>e p n f)" unfolding P\<^sub>e_def using a True by (intro encoding_compose[where f="([0..\<^sub>e Nb\<^sub>e p)"] fun_encoding bounded_nat_encoding) auto thus ?thesis by simp next case False hence "is_encoding (\f. P\<^sub>e p n f)" unfolding P\<^sub>e_def using encoding_triv by simp then show ?thesis by simp qed lemma bounded_degree_polynomial_bit_count: assumes "p > 1" - assumes "x \ bounded_degree_polynomials (Field.mod_ring p) n" + assumes "x \ bounded_degree_polynomials (mod_ring p) n" shows "bit_count (P\<^sub>e p n x) \ ereal (real n * (log 2 p + 1))" proof - - interpret cring "Field.mod_ring p" + interpret cring "mod_ring p" using mod_ring_is_cring assms by blast have a: "x \ carrier (poly_ring (mod_ring p))" using assms(2) by (simp add:bounded_degree_polynomials_def) have "real_of_int \log 2 (p-1)\+1 \ log 2 (p-1) + 1" using floor_eq_iff by (intro add_mono, auto) also have "... \ log 2 p + 1" using assms by (intro add_mono, auto) finally have b: "\log 2 (p-1)\+1 \ log 2 p + 1" by simp have "bit_count (P\<^sub>e p n x) = (\ k \ [0..e p (coeff x k)))" using assms restrict_extensional by (auto intro!:arg_cong[where f="sum_list"] simp add:P\<^sub>e_def fun_bit_count lessThan_atLeast0) also have "... = (\ k \ [0..log 2 (p-1)\+1)" using assms(1) by (simp add:floorlog_def) also have "... \ ereal (real n * (log 2 p + 1))" by (subst ereal_less_eq, intro mult_left_mono b, auto) finally show ?thesis by simp qed end \ No newline at end of file diff --git a/thys/Frequency_Moments/Product_PMF_Ext.thy b/thys/Frequency_Moments/Product_PMF_Ext.thy --- a/thys/Frequency_Moments/Product_PMF_Ext.thy +++ b/thys/Frequency_Moments/Product_PMF_Ext.thy @@ -1,210 +1,210 @@ section \Indexed Products of Probability Mass Functions\ theory Product_PMF_Ext - imports Main Probability_Ext "HOL-Probability.Product_PMF" Universal_Hash_Families.Preliminary_Results + imports Main Probability_Ext "HOL-Probability.Product_PMF" Universal_Hash_Families.Universal_Hash_Families_More_Independent_Families begin hide_const "Isolated.discrete" text \This section introduces a restricted version of @{term "Pi_pmf"} where the default value is @{term "undefined"} and contains some additional results about that case in addition to @{theory "HOL-Probability.Product_PMF"}\ abbreviation prod_pmf where "prod_pmf I M \ Pi_pmf I undefined M" lemma pmf_prod_pmf: assumes "finite I" shows "pmf (prod_pmf I M) x = (if x \ extensional I then \i \ I. (pmf (M i)) (x i) else 0)" by (simp add: pmf_Pi[OF assms(1)] extensional_def) lemma PiE_defaut_undefined_eq: "PiE_dflt I undefined M = PiE I M" by (simp add:PiE_dflt_def PiE_def extensional_def Pi_def set_eq_iff) blast lemma set_prod_pmf: assumes "finite I" shows "set_pmf (prod_pmf I M) = PiE I (set_pmf \ M)" by (simp add:set_Pi_pmf[OF assms] PiE_defaut_undefined_eq) text \A more general version of @{thm [source] "measure_Pi_pmf_Pi"}.\ lemma prob_prod_pmf': assumes "finite I" assumes "J \ I" shows "measure (measure_pmf (Pi_pmf I d M)) (Pi J A) = (\ i \ J. measure (M i) (A i))" proof - have a:"Pi J A = Pi I (\i. if i \ J then A i else UNIV)" using assms by (simp add:Pi_def set_eq_iff, blast) show ?thesis using assms by (simp add:if_distrib a measure_Pi_pmf_Pi[OF assms(1)] prod.If_cases[OF assms(1)] Int_absorb1) qed lemma prob_prod_pmf_slice: assumes "finite I" assumes "i \ I" shows "measure (measure_pmf (prod_pmf I M)) {\. P (\ i)} = measure (M i) {\. P \}" using prob_prod_pmf'[OF assms(1), where J="{i}" and M="M" and A="\_. Collect P"] by (simp add:assms Pi_def) definition restrict_dfl where "restrict_dfl f A d = (\x. if x \ A then f x else d)" lemma pi_pmf_decompose: assumes "finite I" shows "Pi_pmf I d M = map_pmf (\\. restrict_dfl (\i. \ (f i) i) I d) (Pi_pmf (f ` I) (\_. d) (\j. Pi_pmf (f -` {j} \ I) d M))" proof - have fin_F_I:"finite (f ` I)" using assms by blast have "finite I \ ?thesis" using fin_F_I proof (induction "f ` I" arbitrary: I rule:finite_induct) case empty then show ?case by (simp add:restrict_dfl_def) next case (insert x F) have a: "(f -` {x} \ I) \ (f -` F \ I) = I" using insert(4) by blast have b: "F = f ` (f -` F \ I) " using insert(2,4) by (auto simp add:set_eq_iff image_def vimage_def) have c: "finite (f -` F \ I)" using insert by blast have d: "\j. j \ F \ (f -` {j} \ (f -` F \ I)) = (f -` {j} \ I)" using insert(4) by blast have " Pi_pmf I d M = Pi_pmf ((f -` {x} \ I) \ (f -` F \ I)) d M" by (simp add:a) also have "... = map_pmf (\(g, h) i. if i \ f -` {x} \ I then g i else h i) (pair_pmf (Pi_pmf (f -` {x} \ I) d M) (Pi_pmf (f -` F \ I) d M))" using insert by (subst Pi_pmf_union) auto also have "... = map_pmf (\(g,h) i. if f i = x \ i \ I then g i else if f i \ F \ i \ I then h (f i) i else d) (pair_pmf (Pi_pmf (f -` {x} \ I) d M) (Pi_pmf F (\_. d) (\j. Pi_pmf (f -` {j} \ (f -` F \ I)) d M)))" by (simp add:insert(3)[OF b c] map_pmf_comp case_prod_beta' apsnd_def map_prod_def pair_map_pmf2 b[symmetric] restrict_dfl_def) (metis fst_conv snd_conv) also have "... = map_pmf (\(g,h) i. if i \ I then (h(x := g)) (f i) i else d) (pair_pmf (Pi_pmf (f -` {x} \ I) d M) (Pi_pmf F (\_. d) (\j. Pi_pmf (f -` {j} \ I) d M)))" using insert(4) d by (intro arg_cong2[where f="map_pmf"] ext) (auto simp add:case_prod_beta' cong:Pi_pmf_cong) also have "... = map_pmf (\\ i. if i \ I then \ (f i) i else d) (Pi_pmf (insert x F) (\_. d) (\j. Pi_pmf (f -` {j} \ I) d M))" by (simp add:Pi_pmf_insert[OF insert(1,2)] map_pmf_comp case_prod_beta') finally show ?case by (simp add:insert(4) restrict_dfl_def) qed thus ?thesis using assms by blast qed lemma restrict_dfl_iter: "restrict_dfl (restrict_dfl f I d) J d = restrict_dfl f (I \ J) d" by (rule ext, simp add:restrict_dfl_def) lemma indep_vars_restrict': assumes "finite I" shows "prob_space.indep_vars (Pi_pmf I d M) (\_. discrete) (\i \. restrict_dfl \ (f -` {i} \ I) d) (f ` I)" proof - let ?Q = "(Pi_pmf (f ` I) (\_. d) (\i. Pi_pmf (I \ f -` {i}) d M))" have a:"prob_space.indep_vars ?Q (\_. discrete) (\i \. \ i) (f ` I)" using assms by (intro indep_vars_Pi_pmf, blast) have b: "AE x in measure_pmf ?Q. \i\f ` I. x i = restrict_dfl (\i. x (f i) i) (I \ f -` {i}) d" using assms by (auto simp add:PiE_dflt_def restrict_dfl_def AE_measure_pmf_iff set_Pi_pmf comp_def Int_commute) have "prob_space.indep_vars ?Q (\_. discrete) (\i x. restrict_dfl (\i. x (f i) i) (I \ f -` {i}) d) (f ` I)" by (rule prob_space.indep_vars_cong_AE[OF prob_space_measure_pmf b a], simp) thus ?thesis using prob_space_measure_pmf by (auto intro!:prob_space.indep_vars_distr simp:pi_pmf_decompose[OF assms, where f="f"] map_pmf_rep_eq comp_def restrict_dfl_iter Int_commute) qed lemma indep_vars_restrict_intro': assumes "finite I" assumes "\i \. i \ J \ X' i \ = X' i (restrict_dfl \ (f -` {i} \ I) d)" assumes "J = f ` I" assumes "\\ i. i \ J \ X' i \ \ space (M' i)" shows "prob_space.indep_vars (measure_pmf (Pi_pmf I d p)) M' (\i \. X' i \) J" proof - define M where "M \ measure_pmf (Pi_pmf I d p)" interpret prob_space "M" using M_def prob_space_measure_pmf by blast have "indep_vars (\_. discrete) (\i x. restrict_dfl x (f -` {i} \ I) d) (f ` I)" unfolding M_def by (rule indep_vars_restrict'[OF assms(1)]) hence "indep_vars M' (\i \. X' i (restrict_dfl \ ( f -` {i} \ I) d)) (f ` I)" using assms(4) by (intro indep_vars_compose2[where Y="X'" and N="M'" and M'="\_. discrete"]) (auto simp:assms(3)) hence "indep_vars M' (\i \. X' i \) (f ` I)" using assms(2)[symmetric] by (simp add:assms(3) cong:indep_vars_cong) thus ?thesis unfolding M_def using assms(3) by simp qed lemma fixes f :: "'b \ ('c :: {second_countable_topology,banach,real_normed_field})" assumes "finite I" assumes "i \ I" assumes "integrable (measure_pmf (M i)) f" shows integrable_Pi_pmf_slice: "integrable (Pi_pmf I d M) (\x. f (x i))" and expectation_Pi_pmf_slice: "integral\<^sup>L (Pi_pmf I d M) (\x. f (x i)) = integral\<^sup>L (M i) f" proof - have a:"distr (Pi_pmf I d M) (M i) (\\. \ i) = distr (Pi_pmf I d M) discrete (\\. \ i)" by (rule distr_cong, auto) have b: "measure_pmf.random_variable (M i) borel f" using assms(3) by simp have c:"integrable (distr (Pi_pmf I d M) (M i) (\\. \ i)) f" using assms(1,2,3) by (subst a, subst map_pmf_rep_eq[symmetric], subst Pi_pmf_component, auto) show "integrable (Pi_pmf I d M) (\x. f (x i))" by (rule integrable_distr[where f="f" and M'="M i"]) (auto intro: c) have "integral\<^sup>L (Pi_pmf I d M) (\x. f (x i)) = integral\<^sup>L (distr (Pi_pmf I d M) (M i) (\\. \ i)) f" using b by (intro integral_distr[symmetric], auto) also have "... = integral\<^sup>L (map_pmf (\\. \ i) (Pi_pmf I d M)) f" by (subst a, subst map_pmf_rep_eq[symmetric], simp) also have "... = integral\<^sup>L (M i) f" using assms(1,2) by (simp add: Pi_pmf_component) finally show "integral\<^sup>L (Pi_pmf I d M) (\x. f (x i)) = integral\<^sup>L (M i) f" by simp qed text \This is an improved version of @{thm [source] "expectation_prod_Pi_pmf"}. It works for general normed fields instead of non-negative real functions .\ lemma expectation_prod_Pi_pmf: fixes f :: "'a \ 'b \ ('c :: {second_countable_topology,banach,real_normed_field})" assumes "finite I" assumes "\i. i \ I \ integrable (measure_pmf (M i)) (f i)" shows "integral\<^sup>L (Pi_pmf I d M) (\x. (\i \ I. f i (x i))) = (\ i \ I. integral\<^sup>L (M i) (f i))" proof - have a: "prob_space.indep_vars (measure_pmf (Pi_pmf I d M)) (\_. borel) (\i \. f i (\ i)) I" by (intro prob_space.indep_vars_compose2[where Y="f" and M'="\_. discrete"] prob_space_measure_pmf indep_vars_Pi_pmf assms(1)) auto have "integral\<^sup>L (Pi_pmf I d M) (\x. (\i \ I. f i (x i))) = (\ i \ I. integral\<^sup>L (Pi_pmf I d M) (\x. f i (x i)))" by (intro prob_space.indep_vars_lebesgue_integral prob_space_measure_pmf assms(1,2) a integrable_Pi_pmf_slice) auto also have "... = (\ i \ I. integral\<^sup>L (M i) (f i))" by (intro prod.cong expectation_Pi_pmf_slice assms(1,2)) auto finally show ?thesis by simp qed lemma variance_prod_pmf_slice: fixes f :: "'a \ real" assumes "i \ I" "finite I" assumes "integrable (measure_pmf (M i)) (\\. f \^2)" shows "prob_space.variance (Pi_pmf I d M) (\\. f (\ i)) = prob_space.variance (M i) f" proof - have a:"integrable (measure_pmf (M i)) f" using assms(3) measure_pmf.square_integrable_imp_integrable by auto have b:" integrable (measure_pmf (Pi_pmf I d M)) (\x. (f (x i))\<^sup>2)" by (rule integrable_Pi_pmf_slice[OF assms(2) assms(1)], metis assms(3)) have c:" integrable (measure_pmf (Pi_pmf I d M)) (\x. (f (x i)))" by (rule integrable_Pi_pmf_slice[OF assms(2) assms(1)], metis a) have "measure_pmf.expectation (Pi_pmf I d M) (\x. (f (x i))\<^sup>2) - (measure_pmf.expectation (Pi_pmf I d M) (\x. f (x i)))\<^sup>2 = measure_pmf.expectation (M i) (\x. (f x)\<^sup>2) - (measure_pmf.expectation (M i) f)\<^sup>2" using assms a b c by ((subst expectation_Pi_pmf_slice[OF assms(2,1)])?, simp)+ thus ?thesis using assms a b c by (simp add: measure_pmf.variance_eq) qed lemma Pi_pmf_bind_return: assumes "finite I" shows "Pi_pmf I d (\i. M i \ (\x. return_pmf (f i x))) = Pi_pmf I d' M \ (\x. return_pmf (\i. if i \ I then f i (x i) else d))" using assms by (simp add: Pi_pmf_bind[where d'="d'"]) end diff --git a/thys/Median_Method/Median.thy b/thys/Median_Method/Median.thy --- a/thys/Median_Method/Median.thy +++ b/thys/Median_Method/Median.thy @@ -1,887 +1,887 @@ section \Intervals are Borel measurable\ theory Median imports "HOL-Probability.Probability" "HOL-Library.Multiset" - "Universal_Hash_Families.Preliminary_Results" + "Universal_Hash_Families.Universal_Hash_Families_More_Independent_Families" begin text \This section contains a proof that intervals are Borel measurable, where an interval is defined as a convex subset of linearly ordered space, more precisely, a set is an interval, if for each triple of points $x < y < z$: If $x$ and $z$ are in the set so is $y$. This includes ordinary intervals like @{term "{a..b}"}, @{term "{a<.. definition interval :: "('a :: linorder) set \ bool" where "interval I = (\x y z. x \ I \ z \ I \ x \ y \ y \ z \ y \ I)" definition up_ray :: "('a :: linorder) set \ bool" where "up_ray I = (\x y. x \ I \ x \ y \ y \ I)" lemma up_ray_borel: assumes "up_ray (I :: (('a :: linorder_topology) set))" shows "I \ borel" proof (cases "closed I") case True then show ?thesis using borel_closed by blast next case False hence b:"\ closed I" by blast have "open I" proof (rule Topological_Spaces.openI) fix x assume c:"x \ I" show "\T. open T \ x \ T \ T \ I" proof (cases "\y. y < x \ y \ I") case True then obtain y where a:"y < x \ y \ I" by blast have "open {y<..}" by simp moreover have "x \ {y<..}" using a by simp moreover have "{y<..} \ I" using a assms(1) by (auto simp: up_ray_def) ultimately show ?thesis by blast next case False hence "I \ {x..}" using linorder_not_less by auto moreover have "{x..} \ I" using c assms(1) unfolding up_ray_def by blast ultimately have "I = {x..}" by (rule order_antisym) moreover have "closed {x..}" by simp ultimately have "False" using b by auto then show ?thesis by simp qed qed then show ?thesis by simp qed definition down_ray :: "('a :: linorder) set \ bool" where "down_ray I = (\x y. y \ I \ x \ y \ x \ I)" lemma down_ray_borel: assumes "down_ray (I :: (('a :: linorder_topology) set))" shows "I \ borel" proof - have "up_ray (-I)" using assms by (simp add: up_ray_def down_ray_def, blast) hence "(-I) \ borel" using up_ray_borel by blast thus "I \ borel" by (metis borel_comp double_complement) qed text \Main result of this section:\ lemma interval_borel: assumes "interval (I :: (('a :: linorder_topology) set))" shows "I \ borel" proof (cases "I = {}") case True then show ?thesis by simp next case False then obtain x where a:"x \ I" by blast have "\y z. y \ I \ {x..} \ y \ z \ z \ I \ {x..}" by (metis assms a interval_def IntE UnE Un_Int_eq(1) Un_Int_eq(2) atLeast_iff nle_le order.trans) hence "up_ray (I \ {x..})" using up_ray_def by blast hence b:"I \ {x..} \ borel" using up_ray_borel by blast have "\y z. y \ I \ {..x} \ z \ y \ z \ I \ {..x}" by (metis assms a interval_def UnE UnI1 UnI2 atMost_iff dual_order.trans linorder_le_cases) hence "down_ray (I \ {..x})" using down_ray_def by blast hence c:"I \ {..x} \ borel" using down_ray_borel by blast have "I = (I \ {x..}) \ (I \ {..x})" using a by fastforce then show ?thesis using b c by (metis sets.Int) qed section \Order statistics are Borel measurable\ text \This section contains a proof that order statistics of Borel measurable random variables are themselves Borel measurable. The proof relies on the existence of branch-free comparison-sort algorithms. Given a sequence length these algorithms perform compare-swap operations on predefined pairs of positions. In particular the result of a comparison does not affect future operations. An example for a branch-free comparison sort algorithm is shell-sort and also bubble-sort without the early exit. The advantage of using such a comparison-sort algorithm is that it can be lifted to work on random variables, where the result of a comparison-swap operation on two random variables @{term"X"} and @{term"Y"} can be represented as the expressions @{term "\\. min (X \) (Y \)"} and @{term "\\. max (X \) (Y \)"}. Because taking the point-wise minimum (resp. maximum) of two random variables is still Borel measurable, and because the entire sorting operation can be represented using such compare-swap operations, we can show that all order statistics are Borel measuable.\ fun sort_primitive where "sort_primitive i j f k = (if k = i then min (f i) (f j) else (if k = j then max (f i) (f j) else f k))" fun sort_map where "sort_map f n = fold id [sort_primitive j i. i <- [0.. 'b :: linorder" shows "j < n \ i < j \ sort_map f n i \ sort_map f n j" proof (induction n arbitrary: i j) case 0 then show ?case by simp next case (Suc n) define g where "g = (\k. fold id [sort_primitive j n. j <- [0..i j. j < n \ i < j \ g k i \ g k j) \ (\l. l < k \ g k l \ g k n)" proof (induction k) case 0 then show ?case using Suc by (simp add:g_def del:sort_map.simps) next case (Suc k) have "g (Suc k) = sort_primitive k n (g k)" by (simp add:g_def) then show ?case using Suc apply (cases "g k k \ g k n") apply (simp add:min_def max_def) using less_antisym apply blast apply (cases "g k n \ g k k") apply (simp add:min_def max_def) apply (metis less_antisym max.coboundedI2 max.orderE) by simp qed hence "\i j. j < Suc n \ i < j \ g n i \ g n j" apply (simp add:k_def) using less_antisym by blast moreover have "sort_map f (Suc n) = g n" by (simp add:sort_map_ind g_def del:sort_map.simps) ultimately show ?case using Suc by (simp del:sort_map.simps) qed lemma sort_map_mono: fixes f :: "nat \ 'b :: linorder" shows "j < n \ i \ j \ sort_map f n i \ sort_map f n j" by (metis sort_map_strict_mono eq_iff le_imp_less_or_eq) lemma sort_map_perm: fixes f :: "nat \ 'b :: linorder" shows "image_mset (sort_map f n) (mset [0..(ts :: ((nat \ 'b) \ nat \ 'b)). \i < n. \j < n. ts = sort_primitive i j)" define t :: "((nat \ 'b) \ nat \ 'b) list" where "t = [sort_primitive j i. i <- [0..x f. is_swap x \ image_mset (x f) (mset_set {0.. 'b :: linorder" assume "is_swap x" then obtain i j where x_def: "x = sort_primitive i j" and i_bound: "i < n" and j_bound:"j < n" using is_swap_def by blast define inv where "inv = mset_set {k. k < n \ k \ i \ k \ j}" have b:"{0.. k \ i \ k \ j} \ {i,j}" apply (rule order_antisym, rule subsetI, simp, blast, rule subsetI, simp) using i_bound j_bound by meson have c:"\k. k \# inv \ (x f) k = f k" by (simp add:x_def inv_def) have "image_mset (x f) inv = image_mset f inv" apply (rule multiset_eqI) using c multiset.map_cong0 by force moreover have "image_mset (x f) (mset_set {i,j}) = image_mset f (mset_set {i,j})" apply (cases "i = j") by (simp add:x_def max_def min_def)+ moreover have "mset_set {0..x \ set t. is_swap x) \ image_mset (fold id t f) (mset [0..x. x \ set t \ is_swap x" apply (simp add:t_def is_swap_def) by (meson atLeastLessThan_iff imageE less_imp_le less_le_trans) ultimately have "image_mset (fold id t f) (mset [0.. ('b :: linorder)" shows "map (sort_map f n) [0.. 'a \ ('b :: {linorder_topology, second_countable_topology})" assumes "n \ 1" assumes "j < n" assumes "\i. i < n \ X i \ measurable M borel" shows "(\x. (sort_map (\i. X i x) n) j) \ measurable M borel" proof - have n_ge_0:"n > 0" using assms by simp define is_swap where "is_swap = (\(ts :: ((nat \ 'b) \ nat \ 'b)). \i < n. \j < n. ts = sort_primitive i j)" define t :: "((nat \ 'b) \ nat \ 'b) list" where "t = [sort_primitive j i. i <- [0.. 'a \ 'b) \ bool" where "meas_ptw = (\f. (\k. k < n \ f k \ borel_measurable M))" have ind_step: "\x (g :: nat \ 'a \ 'b). meas_ptw g \ is_swap x \ meas_ptw (\k \. x (\i. g i \) k)" proof - fix x g assume "meas_ptw g" hence a:"\k. k < n \ g k \ borel_measurable M" by (simp add:meas_ptw_def) assume "is_swap x" then obtain i j where x_def:"x=sort_primitive i j" and i_le:"i < n" and j_le:"j < n" by (simp add:is_swap_def, blast) have "\k. k < n \ (\\. x (\i. g i \) k) \ borel_measurable M" proof - fix k assume "k < n" thus " (\\. x (\i. g i \) k) \ borel_measurable M" apply (simp add:x_def) apply (cases "k = i", simp) using a i_le j_le borel_measurable_min apply blast apply (cases "k = j", simp) using a i_le j_le borel_measurable_max apply blast using a by simp qed thus "meas_ptw (\k \. x (\i. g i \) k)" by (simp add:meas_ptw_def) qed have "(\x \ set t. is_swap x) \ meas_ptw (\ k \. (fold id t (\k. X k \)) k)" proof (induction t rule:rev_induct) case Nil then show ?case using assms by (simp add:meas_ptw_def) next case (snoc x xs) have a:"meas_ptw (\k \. fold (\a. a) xs (\k. X k \) k)" using snoc by simp have b:"is_swap x" using snoc by simp show ?case using ind_step[OF a b] by simp qed moreover have "\x. x \ set t \ is_swap x" apply (simp add:t_def is_swap_def) by (meson atLeastLessThan_iff imageE less_imp_le less_le_trans) ultimately show ?thesis using assms by (simp add:t_def[symmetric] meas_ptw_def) qed text \Main results of this section:\ lemma order_statistics_measurable: fixes X :: "nat \ 'a \ ('b :: {linorder_topology, second_countable_topology})" assumes "n \ 1" assumes "j < n" assumes "\i. i < n \ X i \ measurable M borel" shows "(\x. (sort (map (\i. X i x) [0.. measurable M borel" apply (subst sort_map_eq_sort[symmetric]) using assms by (simp add:order_statistics_measurable_aux del:sort_map.simps) definition median :: "nat \ (nat \ ('a :: linorder)) \ 'a" where "median n f = sort (map f [0.. 'a \ ('b :: {linorder_topology, second_countable_topology})" assumes "n \ 1" assumes "\i. i < n \ X i \ measurable M borel" shows "(\x. median n (\i. X i x)) \ measurable M borel" apply (simp add:median_def) apply (rule order_statistics_measurable[OF assms(1) _ assms(2)]) using assms(1) by force+ section \The Median Method\ text \This section contains the proof for the probability that the median of independent random variables will be in an interval with high probability if the individual variables are in the same interval with probability larger than $\frac{1}{2}$. The proof starts with the elementary observation that the median of a seqeuence with $n$ elements is in an interval $I$ if at least half of them are in $I$. This works because after sorting the sequence the elements that will be in the interval must necessarily form a consecutive subsequence, if its length is larger than $\frac{n}{2}$ the median must be in it. The remainder follows the proof in \<^cite>\\\textsection 2.1\ in "alon1999"\ using the Hoeffding inequality to estimate the probability that at least half of the sequence elements will be in the interval $I$.\ lemma interval_rule: assumes "interval I" assumes "a \ x" "x \ b" assumes "a \ I" assumes "b \ I" shows "x \ I" using assms(1) apply (simp add:interval_def) using assms by blast lemma sorted_int: assumes "interval I" assumes "sorted xs" assumes "k < length xs" "i \ j" "j \ k " assumes "xs ! i \ I" "xs ! k \ I" shows "xs ! j \ I" apply (rule interval_rule[where a="xs ! i" and b="xs ! k"]) using assms by (simp add: sorted_nth_mono)+ lemma mid_in_interval: assumes "2*length (filter (\x. x \ I) xs) > length xs" assumes "interval I" assumes "sorted xs" shows "xs ! (length xs div 2) \ I" proof - have "length (filter (\x. x \ I) xs) > 0" using assms(1) by linarith then obtain v where v_1: "v < length xs" and v_2: "xs ! v \ I" by (metis filter_False in_set_conv_nth length_greater_0_conv) define J where "J = {k. k < length xs \ xs ! k \ I}" have card_J_min: "2*card J > length xs" using assms(1) by (simp add:J_def length_filter_conv_card) consider (a) "xs ! (length xs div 2) \ I" | (b) "xs ! (length xs div 2) \ I \ v > (length xs div 2)" | (c) "xs ! (length xs div 2) \ I \ v < (length xs div 2)" by (metis linorder_cases v_2) thus ?thesis proof (cases) case a then show ?thesis by simp next case b have p:"\k. k \ length xs div 2 \ xs ! k \ I" using b v_2 sorted_int[OF assms(2) assms(3) v_1, where j="length xs div 2"] apply simp by blast have "card J \ card {Suc (length xs div 2).. length xs - (Suc (length xs div 2))" using card_atLeastLessThan by metis hence "length xs \ 2*( length xs - (Suc (length xs div 2)))" using card_J_min by linarith hence "False" apply (simp add:nat_distrib) apply (subst (asm) le_diff_conv2) using b v_1 apply linarith by simp then show ?thesis by simp next case c have p:"\k. k \ length xs div 2 \ k < length xs \ xs ! k \ I" using c v_1 v_2 sorted_int[OF assms(2) assms(3), where i ="v" and j="length xs div 2"] apply simp by blast have "card J \ card {0..<(length xs div 2)}" apply (rule card_mono, simp) apply (rule subsetI, simp add:J_def not_less_eq_eq[symmetric]) using p linorder_le_less_linear by blast hence "card J \ (length xs div 2)" using card_atLeastLessThan by simp then show ?thesis using card_J_min by linarith qed qed lemma median_est: assumes "interval I" assumes "2*card {k. k < n \ f k \ I} > n" shows "median n f \ I" proof - have a:"{k. k < n \ f k \ I} = {i. i < n \ map f [0.. I}" apply (rule order_antisym, rule subsetI, simp) by (rule subsetI, simp, metis add_0 diff_zero nth_map_upt) show ?thesis apply (simp add:median_def) apply (rule mid_in_interval[where I="I" and xs="sort (map f [0..i. i \ I \ 0 \ f i \ f i \ g i \ g i \ 1" assumes "\x y. x \ A \ (\i \ I. x i \ y i) \ y \ A" shows "measure (Pi_pmf I d (bernoulli_pmf \ f)) A \ measure (Pi_pmf I d (bernoulli_pmf \ g)) A" (is "?L \ ?R") proof - define q where "q i = pmf_of_list [(0::nat, f i), (1, g i - f i), (2, 1 - g i)]" for i have wf:"pmf_of_list_wf [(0::nat, f i), (1, g i - f i), (2, 1 - g i)]" if "i \ I" for i using assms(2)[OF that] by (intro pmf_of_list_wfI) auto have 0: "bernoulli_pmf (f i) = map_pmf (\x. x = 0) (q i)" (is "?L1 = ?R1") if "i \ I" for i proof - have "0 \ f i" "f i \ 1" using assms(2)[OF that] by auto hence "pmf ?L1 x = pmf ?R1 x" for x unfolding q_def pmf_map measure_pmf_of_list[OF wf[OF that]] by (cases x;simp_all add:vimage_def) thus ?thesis by (intro pmf_eqI) auto qed have 1: "bernoulli_pmf (g i) = map_pmf (\x. x \ {0,1}) (q i)" (is "?L1 = ?R1") if "i \ I" for i proof - have "0 \ g i" "g i \ 1" using assms(2)[OF that] by auto hence "pmf ?L1 x = pmf ?R1 x" for x unfolding q_def pmf_map measure_pmf_of_list[OF wf[OF that]] by (cases x;simp_all add:vimage_def) thus ?thesis by (intro pmf_eqI) auto qed have 2: "(\k. x k = 0) \ A \ (\k. x k = 0 \ x k = Suc 0) \ A" for x by (erule assms(3)) auto have "?L = measure (Pi_pmf I d (\i. map_pmf (\x. x = 0) (q i))) A" unfolding comp_def by (simp add:0 cong: Pi_pmf_cong) also have "... = measure (map_pmf ((\) (\x. x = 0)) (Pi_pmf I (if d then 0 else 2) q)) A" by (intro arg_cong2[where f="measure_pmf.prob"] Pi_pmf_map[OF assms(1)]) auto also have "... = measure (Pi_pmf I (if d then 0 else 2) q) {x. (\k. x k = 0) \ A}" by (simp add:comp_def vimage_def) also have "... \ measure (Pi_pmf I (if d then 0 else 2) q) {x. (\k. x k \ {0,1}) \ A}" using 2 by (intro measure_pmf.finite_measure_mono subsetI) auto also have "... = measure (map_pmf ((\) (\x. x \ {0,1})) (Pi_pmf I (if d then 0 else 2) q)) A" by (simp add:vimage_def comp_def) also have "... = measure (Pi_pmf I d (\i. map_pmf (\x. x \ {0,1}) (q i))) A" by (intro arg_cong2[where f="measure_pmf.prob"] Pi_pmf_map[OF assms(1), symmetric]) auto also have "... = ?R" unfolding comp_def by (simp add:1 cong: Pi_pmf_cong) finally show ?thesis by simp qed lemma discrete_measure_eqI: assumes "sets M = count_space UNIV" assumes "sets N = count_space UNIV" assumes "countable \" assumes "\x. x \ \ \ emeasure M {x} = emeasure N {x} \ emeasure M {x} \ \" assumes "AE x in M. x \ \" assumes "AE x in N. x \ \" shows "M = N" proof - define E where "E = insert {} ((\x. {x}) ` \)" have 0: "Int_stable E" unfolding E_def by (intro Int_stableI) auto have 1: "countable E" using assms(3) unfolding E_def by simp have "E \ Pow \" unfolding E_def by auto have "emeasure M A = emeasure N A" if A_range: "A \ E" for A using that assms(4) unfolding E_def by auto moreover have "sets M = sets N" using assms(1,2) by simp moreover have "\ \ sets M" using assms(1) by simp moreover have "E \ {}" unfolding E_def by simp moreover have "\E = \" unfolding E_def by simp moreover have "emeasure M a \ \" if "a \ E" for a using that assms(4) unfolding E_def by auto moreover have "sets (restrict_space M \) = Pow \" using assms(1) by (simp add:sets_restrict_space range_inter) moreover have "sets (restrict_space N \) = Pow \" using assms(2) by (simp add:sets_restrict_space range_inter) moreover have "sigma_sets \ E = Pow \" unfolding E_def by (intro sigma_sets_singletons_and_empty assms(3)) ultimately show ?thesis by (intro measure_eqI_restrict_generator[OF 0 _ _ _ _ _ _ assms(5,6) 1]) auto qed text \Main results of this section:\ text \The next theorem establishes a bound for the probability of the median of independent random variables using the binomial distribution. In a follow-up step, we will establish tail bounds for the binomial distribution and corresponding median bounds. This two-step strategy was suggested by Yong Kiam Tan. In a previous version, I only had verified the exponential tail bound (see theorem \verb+median_bound+ below).\ theorem (in prob_space) median_bound_raw: fixes I :: "('b :: {linorder_topology, second_countable_topology}) set" assumes "n > 0" "p \ 0" assumes "interval I" assumes "indep_vars (\_. borel) X {0..i. i < n \ \

(\ in M. X i \ \ I) \ p" shows "\

(\ in M. median n (\i. X i \) \ I) \ 1 - measure (binomial_pmf n p) {..n div 2}" (is "?L \ ?R") proof - let ?pi = "Pi_pmf {..(\ in M. X i \ \ I)" for i have n_ge_1: "n \ 1" using assms(1) by simp have 0: "{k. k < n \ (k < n \ X k \ \ I)} = {k. k < n \ X k \ \ I}" for \ by auto have "countable ({..\<^sub>E (UNIV :: bool set))" by (intro countable_PiE) auto hence countable_ext: "countable (extensional {.. bool) set)" unfolding PiE_def by auto have m0: "I \ sets borel" using interval_borel[OF assms(3)] by simp have m1: "random_variable borel (\x. X k x)" if "k \ {..x. x \ I) \ borel \\<^sub>M (measure_pmf ((bernoulli_pmf \ q) k))" for k using m0 by measurable hence m3: "random_variable (measure_pmf ((bernoulli_pmf \ q) k)) (\x. X k x \ I)" if "k \ {.. q)) (\\. \k\{.. \ I)" by (intro measurable_restrict) auto moreover have "A \ sets (Pi\<^sub>M {..x. measure_pmf (bernoulli_pmf (q x))))" if "A \ extensional {..a \ A. {a})" by auto also have "... = (\a \ A. PiE {..k. {a k}))" using that by (intro arg_cong[where f="Union"] image_cong refl PiE_singleton[symmetric]) auto also have "... \ sets (Pi\<^sub>M {..x. measure_pmf (bernoulli_pmf (q x))))" using that countable_ext countable_subset by (intro sets.countable_Union countable_image image_subsetI sets_PiM_I_finite) auto finally show ?thesis by simp qed hence m5: "id \ (PiM {.. q)) \\<^sub>M (count_space UNIV)" by (intro measurableI) (simp_all add:vimage_def space_PiM PiE_def) ultimately have "random_variable (count_space UNIV) (id \ (\\. \k\{.. \ I))" by (rule measurable_comp) hence m6: "random_variable (count_space UNIV) (\\. \k\{.. \ I)" by simp have indep: "indep_vars (bernoulli_pmf \ q) (\i x. X i x \ I) {0.. space M. (X k x \ I) = \} = measure (bernoulli_pmf (q k)) {\}" if "k < n" for \ k proof (cases "\") case True then show ?thesis unfolding q_def by (simp add:measure_pmf_single) next case False have "{x \ space M. X k x \ I} \ events" using that m0 by (intro measurable_sets_Collect[OF m1]) auto hence "prob {x \ space M. X k x \ I} = 1 - prob {\ \ space M. X k \ \ I}" by (subst prob_neg) auto thus ?thesis using False unfolding q_def by (simp add:measure_pmf_single) qed hence 1: "emeasure M {x \ space M. (X k x \ I) = \} = emeasure (bernoulli_pmf (q k)) {\}" if "k < n" for \ k using that unfolding emeasure_eq_measure measure_pmf.emeasure_eq_measure by simp interpret product_sigma_finite "(bernoulli_pmf \ q)" by standard have "distr M (count_space UNIV) (\\. (\k\{.. \ I)) = distr (distr M (PiM {.. q)) (\\. \k\{.. \ I)) (count_space UNIV) id" by (subst distr_distr[OF m5 m4]) (simp add:comp_def) also have "... = distr (PiM {..i. (distr M ((bernoulli_pmf \ q) i) (\\. X i \ \ I) ))) (count_space UNIV) id" using assms(1) indep atLeast0LessThan by (intro arg_cong2[where f="\x y. distr x y id"] iffD1[OF indep_vars_iff_distr_eq_PiM'] m3) auto also have "... = distr (PiM {.. q)) (count_space UNIV) id" using m3 1 by (intro distr_cong PiM_cong refl discrete_measure_eqI[where \="UNIV"]) (simp_all add:emeasure_distr vimage_def Int_def conj_commute) also have "... = ?pi (bernoulli_pmf \ q)" proof (rule discrete_measure_eqI[where \="extensional {..M {.. q)) {x} = emeasure (Pi\<^sub>M {.. q)) (PiE {..k. {x k}))" using PiE_singleton[OF 4] by simp also have "... = (\iq)) (PiE_dflt {..k. {x k}))" unfolding measure_pmf.emeasure_eq_measure by (subst measure_Pi_pmf_PiE_dflt) (simp_all add:prod_ennreal) also have "... = emeasure (Pi_pmf {..q)) {x}" using 4 by (intro arg_cong2[where f="emeasure"]) (auto simp add:PiE_dflt_def extensional_def) finally have "emeasure (Pi\<^sub>M {.. q)) {x} = emeasure (Pi_pmf {.. q)) {x}" by simp thus ?case using 4 by (subst (1 2) emeasure_distr[OF m5]) (simp_all add:vimage_def space_PiM PiE_def) next case 5 have "AE x in Pi\<^sub>M {.. q). x \ extensional {..\. (\k\{.. \ I)) = ?pi (bernoulli_pmf\q)" by simp have 3: "n < 2 * card {k. k < n \ y k}" if "n < 2 * card {k. k < n \ x k}" "\i. i < n \ x i \ y i" for x y proof - have "2 * card {k. k < n \ x k} \ 2 * card {k. k < n \ y k}" using that(2) by (intro mult_left_mono card_mono) auto thus ?thesis using that(1) by simp qed have 4: "0 \ p \ p \ q i \ q i \ 1" if "i < n" for i unfolding q_def using assms(2,5) that by auto have p_range: "p \ {0..1}" using 4[OF assms(1)] by auto have "?R = 1 - measure_pmf.prob (binomial_pmf n p) {k. 2 * k \ n}" by (intro arg_cong2[where f="(-)"] arg_cong2[where f="measure_pmf.prob"]) auto also have "... = measure (binomial_pmf n p) {k. n < 2 * k}" by (subst measure_pmf.prob_compl[symmetric]) (simp_all add:set_diff_eq not_le) also have "... = measure (?pi (bernoulli_pmf \ (\_. p))) {\. n < 2 * card {k. k < n \ \ k}}" using p_range by (subst binomial_pmf_altdef'[where A="{.. measure (?pi (bernoulli_pmf \ q)) {\. n < 2 * card {k. k < n \ \ k}}" using 3 4 by (intro prod_pmf_bernoulli_mono) auto also have "... = \

(\ in distr M (count_space UNIV) (\\. \k\{.. \ I). n<2*card {k. k < n \ \ k})" unfolding 2 by simp also have "... = \

(\ in M. n < 2*card {k. k < n \ X k \ \ I})" by (subst measure_distr[OF m6]) (simp_all add:vimage_def Int_def conj_commute 0) also have "... \ ?L" using median_est[OF assms(3)] m0 m1 by (intro finite_measure_mono measurable_sets_Collect[OF median_measurable[OF n_ge_1]]) auto finally show "?R \ ?L" by simp qed text \Cumulative distribution of the binomial distribution (contributed by Yong Kiam Tan):\ lemma prob_binomial_pmf_upto: assumes "0 \ p" "p \ 1" shows "measure_pmf.prob (binomial_pmf n p) {..m} = sum (\i. real (n choose i) * p^i * (1 - p) ^(n-i)) {0..m}" by (auto simp: pmf_binomial[OF assms] measure_measure_pmf_finite intro!: sum.cong) text \A tail bound for the binomial distribution using Hoeffding's inequality:\ lemma binomial_pmf_tail: assumes "p \ {0..1}" "real k \ real n * p" shows "measure (binomial_pmf n p) {..k} \ exp (- 2 * real n * (p - real k / n)^2)" (is "?L \ ?R") proof (cases "n = 0") case True then show ?thesis by simp next case False let ?A = "{.. where "\ = (\ix. (of_bool (x i) :: real) \ ?pi))" define \ :: real where "\ = \ - k" (* eps \ 0 <-> k \ mu *) have "\ = (\ix. (of_bool x :: real) \ (map_pmf (\\. \ i) ?pi)))" unfolding \_def by simp also have "... = (\ix. (of_bool x :: real) \ (bernoulli_pmf p)))" by (simp add: Pi_pmf_component) also have "... = real n * p" using assms(1) by simp finally have \_alt: "\ = real n * p" by simp have \_ge_0: "\ \ 0" using assms(2) unfolding \_def \_alt by auto have indep: "prob_space.indep_vars ?pi (\_. borel) (\k \. of_bool (\ k)) {..k \. of_bool (\ k)" "\_.0" "\_.1" \ using indep unfolding \_def by (unfold_locales) simp_all have "?L = measure (map_pmf (\f. card {x \ ?A. f x}) ?pi) {..k}" by (intro arg_cong2[where f="measure_pmf.prob"] binomial_pmf_altdef' assms(1)) auto also have "... = \

(\ in ?pi. (\i i)) \ \ - \)" unfolding \_def by (simp add:vimage_def Int_def) also have "... \ exp (- 2 * \\<^sup>2 / (\i2))" using False by (intro Hoeffding_ineq_le \_ge_0) auto also have "... = ?R" unfolding \_def \_alt by (simp add:power2_eq_square field_simps) finally show ?thesis by simp qed theorem (in prob_space) median_bound: fixes n :: nat fixes I :: "('b :: {linorder_topology, second_countable_topology}) set" assumes "interval I" assumes "\ > 0" assumes "\ \ {0<..<1}" assumes "indep_vars (\_. borel) X {0.. - ln \ / (2 * \\<^sup>2)" assumes "\i. i < n \ \

(\ in M. X i \ \ I) \ 1/2+\" shows "\

(\ in M. median n (\i. X i \) \ I) \ 1-\" proof - have "0 < -ln \ / (2 * \\<^sup>2)" using assms by (intro divide_pos_pos) auto also have "... \ real n" using assms by simp finally have "real n > 0" by simp hence n_ge_0:"n > 0" by simp have d0: "real_of_int \real n / 2\ * 2 / real n \ 1" using n_ge_0 by simp linarith hence d1: "real (nat \real n / 2\) \ real n * (1 / 2)" using n_ge_0 by (simp add:field_simps) also have "... \ real n * (1 / 2 + \)" using assms(2) by (intro mult_left_mono) auto finally have d1: "real (nat \real n / 2\) \ real n * (1 / 2 + \)" by simp have "1/2 + \ \ \

(\ in M. X 0 \ \ I)" using n_ge_0 by (intro assms(6)) also have "... \ 1" by simp finally have d2: "1 / 2 + \ \ 1" by simp have d3: "nat \real n / 2\ = n div 2" by linarith have "1 - \ \ 1 - exp (- 2 * real n * \\<^sup>2)" using assms(2,3,5) by (intro diff_mono order.refl iffD1[OF ln_ge_iff]) (auto simp:field_simps) also have "... \ 1 - exp (- 2 * real n * ((1/2+\) - real (nat \real n/2\) / real n)\<^sup>2)" using d0 n_ge_0 assms(2) by (intro diff_mono order.refl iffD2[OF exp_le_cancel_iff] mult_left_mono_neg power_mono) auto also have "... \ 1 - measure (binomial_pmf n (1/2+\)) {..nat \real n/2\}" using assms(2) d1 d2 by (intro diff_mono order.refl binomial_pmf_tail) auto also have "... = 1 - measure (binomial_pmf n (1/2+\)) {..n div 2}" by (simp add:d3) also have "... \ \

(\ in M. median n (\i. X i \) \ I)" using assms(2) by (intro median_bound_raw n_ge_0 assms(1,4,6) add_nonneg_nonneg) auto finally show ?thesis by simp qed text \This is a specialization of the above to closed real intervals.\ corollary (in prob_space) median_bound_1: assumes "\ > 0" assumes "\ \ {0<..<1}" assumes "indep_vars (\_. borel) X {0.. - ln \ / (2 * \\<^sup>2)" assumes "\i \ {0..(\ in M. X i \ \ ({a..b} :: real set)) \ 1/2+\" shows "\

(\ in M. median n (\i. X i \) \ {a..b}) \ 1-\" using assms(5) by (intro median_bound[OF _ assms(1,2,3,4)]) (auto simp:interval_def) text \This is a specialization of the above, where $\alpha = \frac{1}{6}$ and the interval is described using a mid point @{term "\"} and radius @{term "\"}. The choice of $\alpha = \frac{1}{6}$ implies a success probability per random variable of $\frac{2}{3}$. It is a commonly chosen success probability for Monte-Carlo algorithms (cf. \<^cite>\\\textsection 4\ in "baryossef2002"\ or \<^cite>\\\textsection 1\ in "kane2010"\).\ corollary (in prob_space) median_bound_2: fixes \ \ :: real assumes "\ \ {0<..<1}" assumes "indep_vars (\_. borel) X {0.. -18 * ln \" assumes "\i. i < n \ \

(\ in M. abs (X i \ - \) > \) \ 1/3" shows "\

(\ in M. abs (median n (\i. X i \) - \) \ \) \ 1-\" proof - have b:"\i. i < n \ space M - {\ \ space M. X i \ \ {\ - \..\ + \}} = {\ \ space M. abs (X i \ - \) > \}" apply (rule order_antisym, rule subsetI, simp, linarith) by (rule subsetI, simp, linarith) have "\i. i < n \ 1 - \

(\ in M. X i \ \ {\- \..\+\}) \ 1/3" apply (subst prob_compl[symmetric]) apply (measurable) using assms(2) apply (simp add:indep_vars_def) apply (subst b, simp) using assms(4) by simp hence a:"\i. i < n \ \

(\ in M. X i \ \ {\- \..\+\}) \ 2/3" by simp have "1-\ \ \

(\ in M. median n (\i. X i \) \ {\-\..\+\})" apply (rule median_bound_1[OF _ assms(1) assms(2), where \="1/6"], simp) using assms(3) apply (simp add:power2_eq_square) using a by simp also have "... = \

(\ in M. abs (median n (\i. X i \) - \) \ \)" apply (rule arg_cong2[where f="measure"], simp) apply (rule order_antisym, rule subsetI, simp, linarith) by (rule subsetI, simp, linarith) finally show ?thesis by simp qed section \Some additional results about the median\ lemma sorted_mono_map: assumes "sorted xs" assumes "mono f" shows "sorted (map f xs)" using assms apply (simp add:sorted_wrt_map) apply (rule sorted_wrt_mono_rel[where P="(\)"]) by (simp add:mono_def, simp) text \This could be added to @{theory "HOL.List"}:\ lemma map_sort: assumes "mono f" shows "sort (map f xs) = map f (sort xs)" using assms by (intro properties_for_sort sorted_mono_map) auto lemma median_cong: assumes "\i. i < n \ f i = g i" shows "median n f = median n g" apply (cases "n = 0", simp add:median_def) apply (simp add:median_def) apply (rule arg_cong2[where f="(!)"]) apply (rule arg_cong[where f="sort"], rule map_cong, simp, simp add:assms) by simp lemma median_restrict: "median n (\i \ {0.. 0" assumes "mono g" shows "g (median n f) = median n (g \ f)" apply (simp add: median_def del:map_map) apply (subst map_map[symmetric]) apply (subst map_sort[OF assms(2)]) apply (subst nth_map, simp) using assms apply fastforce by simp lemma median_rat: assumes "n > 0" shows "real_of_rat (median n f) = median n (\i. real_of_rat (f i))" apply (subst (2) comp_def[where g="f", symmetric]) apply (rule median_commute_mono[OF assms(1)]) by (simp add: mono_def of_rat_less_eq) lemma median_const: assumes "k > 0" shows "median k (\i \ {0.._. a) [0.._. a) [0.._. a) [0..i \ {0.._. a)" by (subst median_restrict, simp) also have "... = a" using assms by (simp add:median_def a) finally show ?thesis by simp qed end diff --git a/thys/Universal_Hash_Families/Carter_Wegman_Hash_Family.thy b/thys/Universal_Hash_Families/Carter_Wegman_Hash_Family.thy --- a/thys/Universal_Hash_Families/Carter_Wegman_Hash_Family.thy +++ b/thys/Universal_Hash_Families/Carter_Wegman_Hash_Family.thy @@ -1,293 +1,293 @@ section \Carter-Wegman Hash Family\label{sec:carter_wegman}\ theory Carter_Wegman_Hash_Family imports Interpolation_Polynomials_HOL_Algebra.Interpolation_Polynomial_Cardinalities - Preliminary_Results + Universal_Hash_Families_More_Independent_Families begin text \The Carter-Wegman hash family is a generic method to obtain $k$-universal hash families for arbitrary $k$. (There are faster solutions, such as tabulation hashing, which are limited to a specific $k$. See for example \<^cite>\"thorup2010"\.) The construction was described by Wegman and Carter~\<^cite>\"wegman1981"\, it is a hash family between the elements of a finite field and works by choosing randomly a polynomial over the field with degree less than $k$. The hash function is the evaluation of a such a polynomial. Using the property that the fraction of polynomials interpolating a given set of $s \leq k$ points is @{term "1/(card (carrier R)^s)"}, which is shown in \<^cite>\"Interpolation_Polynomials_HOL_Algebra-AFP"\, it is possible to obtain both that the hash functions are $k$-wise independent and uniformly distributed. In the following two locales are introduced, the main reason for both is to make the statements of the theorems and proofs more concise. The first locale @{term "poly_hash_family"} fixes a finite ring $R$ and the probability space of the polynomials of degree less than $k$. Because the ring is not a field, the family is not yet $k$-universal, but it is still possible to state a few results such as the fact that the range of the hash function is a subset of the carrier of the ring. The second locale @{term "carter_wegman_hash_family"} is an extension of the former with the assumption that $R$ is a field with which the $k$-universality follows. The reason for using two separate locales is to support use cases, where the ring is only probably a field. For example if it is the set of integers modulo an approximate prime, in such a situation a subset of the properties of an algorithm using approximate primes would need to be verified even if $R$ is only a ring.\ definition (in ring) "hash x \ = eval \ x" locale poly_hash_family = ring + fixes k :: nat assumes finite_carrier[simp]: "finite (carrier R)" assumes k_ge_0: "k > 0" begin definition space where "space = bounded_degree_polynomials R k" definition M where "M = measure_pmf (pmf_of_set space)" lemma finite_space[simp]:"finite space" unfolding space_def using fin_degree_bounded finite_carrier by simp lemma non_empty_bounded_degree_polynomials[simp]:"space \ {}" unfolding space_def using non_empty_bounded_degree_polynomials by simp text \This is to add @{thm [source] carrier_not_empty} to the simp set in the context of @{locale "poly_hash_family"}:\ lemma non_empty_carrier[simp]: "carrier R \ {}" by (simp add:carrier_not_empty) sublocale prob_space "M" by (simp add:M_def prob_space_measure_pmf) lemma hash_range[simp]: assumes "\ \ space" assumes "x \ carrier R" shows "hash x \ \ carrier R" using assms unfolding hash_def space_def bounded_degree_polynomials_def by (simp, metis eval_in_carrier polynomial_incl univ_poly_carrier) lemma hash_range_2: assumes "\ \ space" shows "(\x. hash x \) ` carrier R \ carrier R" using hash_range assms by auto lemma integrable_M[simp]: fixes f :: "'a list \ 'c::{banach, second_countable_topology}" shows "integrable M f" unfolding M_def by (rule integrable_measure_pmf_finite, simp) end locale carter_wegman_hash_family = poly_hash_family + assumes field_R: "field R" begin sublocale field using field_R by simp abbreviation "field_size \ card (carrier R)" lemma poly_cards: assumes "K \ carrier R" assumes "card K \ k" assumes "y ` K \ (carrier R)" shows "card {\ \ space. (\k \ K. eval \ k = y k)} = field_size^(k-card K)" unfolding space_def using interpolating_polynomials_card[where n="k-card K" and K="K"] assms using finite_carrier finite_subset by fastforce lemma poly_cards_single: assumes "x \ carrier R" assumes "y \ carrier R" shows "card {\ \ space. eval \ x = y} = field_size^(k-1)" using poly_cards[where K="{x}" and y="\_. y", simplified] assms k_ge_0 by simp lemma hash_prob: assumes "K \ carrier R" assumes "card K \ k" assumes "y ` K \ carrier R" shows "prob {\. (\x \ K. hash x \ = y x)} = 1/(real field_size)^card K" proof - have "\ \ carrier R" by simp hence a:"field_size > 0" using finite_carrier card_gt_0_iff by blast have b:"real (card {\ \ space. \x\K. eval \ x = y x}) / real (card space) = 1 / real field_size ^ card K" using a assms(2) apply (simp add: frac_eq_eq poly_cards[OF assms(1,2,3)] power_add[symmetric]) by (simp add:space_def bounded_degree_polynomials_card) show ?thesis unfolding M_def by (simp add:hash_def measure_pmf_of_set Int_def b) qed lemma prob_single: assumes "x \ carrier R" "y \ carrier R" shows "prob {\. hash x \ = y} = 1/(real field_size)" using hash_prob[where K="{x}"] assms finite_carrier k_ge_0 by simp lemma prob_range: assumes [simp]:"x \ carrier R" shows "prob {\. hash x \ \ A} = card (A \ carrier R) / field_size" proof - have "prob {\. hash x \ \ A} = prob (\a \ A \ carrier R. {\. hash x \ = a})" by (rule measure_pmf_eq, auto simp:M_def) also have "... = (\ a \ (A \ carrier R). prob {\. hash x \ = a})" by (rule measure_finite_Union, auto simp:M_def disjoint_family_on_def) also have "... = (\ a \ (A \ carrier R). 1/(real field_size))" by (rule sum.cong, auto simp:prob_single) also have "... = card (A \ carrier R) / field_size" by simp finally show ?thesis by simp qed lemma indep: assumes "J \ carrier R" assumes "card J \ k" shows "indep_vars (\_. discrete) hash J" proof - have "\ \ carrier R" by simp hence card_R_ge_0:"field_size > 0" using card_gt_0_iff finite_carrier by blast have fin_J: "finite J" using finite_carrier assms(1) finite_subset by blast show ?thesis proof (rule indep_vars_pmf[OF M_def]) fix a fix J' assume a: "J' \ J" "finite J'" have card_J': "card J' \ k" by (metis card_mono order_trans a(1) assms(2) fin_J) have J'_in_carr: "J' \ carrier R" by (metis order_trans a(1) assms(1)) show "prob {\. \x\J'. hash x \ = a x} = (\x\J'. prob {\. hash x \ = a x})" proof (cases "a ` J' \ carrier R") case True have a_carr: "\x. x \ J' \ a x \ carrier R" using True by force have "prob {\. \x\J'. hash x \ = a x} = real (card {\ \ space. \x\J'. eval \ x = a x}) / real (card space)" by (simp add:M_def measure_pmf_of_set Int_def hash_def) also have "... = real (field_size ^ (k - card J')) / real (card space)" using True by (simp add: poly_cards[OF J'_in_carr card_J']) also have "... = real field_size ^ (k - card J') / real field_size ^ k" by (simp add:space_def bounded_degree_polynomials_card) also have "... = real field_size ^ ((k - 1) * card J') / real field_size ^ (k * card J')" using card_J' by (simp add:power_add[symmetric] power_mult[symmetric] diff_mult_distrib frac_eq_eq add.commute) also have "... = (real field_size ^ (k - 1)) ^ card J' / (real field_size ^ k) ^ card J'" by (simp add:power_add power_mult) also have "... = (\x\J'. real (card {\ \ space. eval \ x = a x}) / real (card space))" using a_carr poly_cards_single[OF subsetD[OF J'_in_carr]] by (simp add:space_def bounded_degree_polynomials_card power_divide) also have "... = (\x\J'. prob {\. hash x \ = a x})" by (simp add:measure_pmf_of_set M_def Int_def hash_def) finally show ?thesis by simp next case False then obtain j where j_def: "j \ J'" "a j \ carrier R" by blast have "{\ \ space. hash j \ = a j} \ {\ \ space. hash j \ \ carrier R}" by (rule subsetI, simp add:j_def) also have "... \ {}" using j_def(1) J'_in_carr hash_range by blast finally have b:"{\ \ space. hash j \ = a j} = {}" by simp hence "real (card ({\ \ space. hash j \ = a j})) = 0" by simp hence "(\x\J'. real (card {\ \ space. hash x \ = a x})) = 0" using a(2) prod_zero[OF a(2)] j_def(1) by auto moreover have "{\ \ space. \x\J'. hash x \ = a x} \ {\ \ space. hash j \ = a j}" using j_def by blast hence "{\ \ space. \x\J'. hash x \ = a x} = {}" using b by blast ultimately show ?thesis by (simp add:measure_pmf_of_set M_def Int_def prod_dividef) qed qed qed lemma k_wise_indep: "k_wise_indep_vars k (\_. discrete) hash (carrier R)" unfolding k_wise_indep_vars_def using indep by simp lemma inj_if_degree_1: assumes "\ \ space" assumes "degree \ = 1" shows "inj_on (\x. hash x \) (carrier R)" using assms eval_inj_if_degree_1 by (simp add:M_def space_def bounded_degree_polynomials_def hash_def) lemma uniform: assumes "i \ carrier R" shows "uniform_on (hash i) (carrier R)" proof - have a: "\a. prob {\. hash i \ \ {a}} = indicat_real (carrier R) a / real field_size" by (subst prob_range[OF assms], simp add:indicator_def) show ?thesis by (rule uniform_onI, use a M_def in auto) qed text \This the main result of this section - the Carter-Wegman hash family is $k$-universal.\ theorem k_universal: "k_universal k hash (carrier R) (carrier R)" using uniform k_wise_indep by (simp add:k_universal_def) end lemma poly_hash_familyI: assumes "ring R" assumes "finite (carrier R)" assumes "0 < k" shows "poly_hash_family R k" using assms by (simp add:poly_hash_family_def poly_hash_family_axioms_def) lemma carter_wegman_hash_familyI: assumes "field F" assumes "finite (carrier F)" assumes "0 < k" shows "carter_wegman_hash_family F k" using assms field.is_ring[OF assms(1)] poly_hash_familyI by (simp add:carter_wegman_hash_family_def carter_wegman_hash_family_axioms_def) lemma hash_k_wise_indep: assumes "field F \ finite (carrier F)" assumes "1 \ n" shows "prob_space.k_wise_indep_vars (pmf_of_set (bounded_degree_polynomials F n)) n (\_. pmf_of_set (carrier F)) (ring.hash F) (carrier F)" proof - interpret carter_wegman_hash_family "F" "n" using assms carter_wegman_hash_familyI by force have "k_wise_indep_vars n (\_. pmf_of_set (carrier F)) hash (carrier F)" by (rule k_wise_indep_vars_compose[OF k_wise_indep], simp) thus ?thesis by (simp add:M_def space_def) qed lemma hash_prob_single: assumes "field F \ finite (carrier F)" assumes "x \ carrier F" assumes "1 \ n" assumes "y \ carrier F" shows "\

(\ in pmf_of_set (bounded_degree_polynomials F n). ring.hash F x \ = y) = 1/(real (card (carrier F)))" proof - interpret carter_wegman_hash_family "F" "n" using assms carter_wegman_hash_familyI by force show ?thesis using prob_single[OF assms(2,4)] by (simp add:M_def space_def) qed end diff --git a/thys/Universal_Hash_Families/ROOT b/thys/Universal_Hash_Families/ROOT --- a/thys/Universal_Hash_Families/ROOT +++ b/thys/Universal_Hash_Families/ROOT @@ -1,16 +1,16 @@ chapter AFP session Universal_Hash_Families = "HOL-Probability" + options [timeout = 600] sessions "HOL-Algebra" "Finite_Fields" "Interpolation_Polynomials_HOL_Algebra" theories - Definitions - Preliminary_Results + Universal_Hash_Families + Universal_Hash_Families_More_Independent_Families Carter_Wegman_Hash_Family - Field + Universal_Hash_Families_More_Finite_Fields document_files "root.tex" "root.bib" diff --git a/thys/Universal_Hash_Families/Definitions.thy b/thys/Universal_Hash_Families/Universal_Hash_Families.thy rename from thys/Universal_Hash_Families/Definitions.thy rename to thys/Universal_Hash_Families/Universal_Hash_Families.thy --- a/thys/Universal_Hash_Families/Definitions.thy +++ b/thys/Universal_Hash_Families/Universal_Hash_Families.thy @@ -1,81 +1,81 @@ section \Introduction and Definition\ -theory Definitions +theory Universal_Hash_Families imports "HOL-Probability.Independent_Family" begin text \Universal hash families are commonly used in randomized algorithms and data structures to randomize the input of algorithms, such that probabilistic methods can be employed without requiring any assumptions about the input distribution. If we regard a family of hash functions from a domain $D$ to a finite range $R$ as a uniform probability space, then the family is $k$-universal if: \begin{itemize} \item For each $x \in D$ the evaluation of the functions at $x$ forms a uniformly distributed random variable on $R$. \item The evaluation random variables for $k$ or fewer distinct domain elements form an independent family of random variables. \end{itemize} This definition closely follows the definition from Vadhan~\<^cite>\\\textsection 3.5.5\ in "vadhan2012"\, with the minor modification that independence is required not only for exactly $k$, but also for \emph{fewer} than $k$ distinct domain elements. The correction is due to the fact that in the corner case where $D$ has fewer than $k$ elements, the second part of their definition becomes void. In the formalization this helps avoid an unnecessary assumption in the theorems. The following definition introduces the notion of $k$-wise independent random variables:\ definition (in prob_space) k_wise_indep_vars where "k_wise_indep_vars k M' X I = (\J \ I. card J \ k \ finite J \ indep_vars M' X J)" lemma (in prob_space) k_wise_indep_vars_subset: assumes "k_wise_indep_vars k M' X I" assumes "J \ I" assumes "finite J" assumes "card J \ k" shows "indep_vars M' X J" using assms by (simp add:k_wise_indep_vars_def) text \Similarly for a finite non-empty set $A$ the predicate @{term "uniform_on X A"} indicates that the random variable is uniformly distributed on $A$:\ definition (in prob_space) "uniform_on X A = ( distr M (count_space UNIV) X = uniform_measure (count_space UNIV) A \ A \ {} \ finite A \ random_variable (count_space UNIV) X)" lemma (in prob_space) uniform_onD: assumes "uniform_on X A" shows "prob {\ \ space M. X \ \ B} = card (A \ B) / card A" proof - have "prob {\ \ space M. X \ \ B} = prob (X -` B \ space M)" by (subst Int_commute, simp add:vimage_def Int_def) also have "... = measure (distr M (count_space UNIV) X) B" using assms by (subst measure_distr, auto simp:uniform_on_def) also have "... = measure (uniform_measure (count_space UNIV) A) B" using assms by (simp add:uniform_on_def) also have "... = card (A \ B) / card A" using assms by (subst measure_uniform_measure, auto simp:uniform_on_def)+ finally show ?thesis by simp qed text \With the two previous definitions it is possible to define the $k$-universality condition for a family of hash functions from $D$ to $R$:\ definition (in prob_space) "k_universal k X D R = ( k_wise_indep_vars k (\_. count_space UNIV) X D \ (\i \ D. uniform_on (X i) R))" text \Note: The definition is slightly more generic then the informal specification from above. This is because usually a family is formed by a single function with a variable seed parameter. Instead of choosing a random function from a probability space, a random seed is chosen from the probability space which parameterizes the hash function. The following section contains some preliminary results about independent families of random variables. Section~\ref{sec:carter_wegman} introduces the Carter-Wegman hash family, which is an explicit construction of $k$-universal families for arbitrary $k$ using polynomials over finite fields. The last section contains a proof that the factor ring of the integers modulo a prime ideal is a finite field, followed by an isomorphic construction of prime fields over an initial segment of the natural numbers.\ end diff --git a/thys/Universal_Hash_Families/Field.thy b/thys/Universal_Hash_Families/Universal_Hash_Families_More_Finite_Fields.thy rename from thys/Universal_Hash_Families/Field.thy rename to thys/Universal_Hash_Families/Universal_Hash_Families_More_Finite_Fields.thy --- a/thys/Universal_Hash_Families/Field.thy +++ b/thys/Universal_Hash_Families/Universal_Hash_Families_More_Finite_Fields.thy @@ -1,253 +1,253 @@ section \Finite Fields\ -theory Field +theory Universal_Hash_Families_More_Finite_Fields imports - "Finite_Fields.Ring_Characteristic" + Finite_Fields.Ring_Characteristic "HOL-Algebra.Ring_Divisibility" "HOL-Algebra.IntRing" begin text \In some applications it is more convenient to work with natural numbers instead of @{term "ZFact p"} whose elements are cosets. To support that use case the following definition introduces an additive and multiplicative structure on @{term "{.. lemma zfact_iso_0: assumes "n > 0" shows "zfact_iso n 0 = \\<^bsub>ZFact (int n)\<^esub>" proof - let ?I = "Idl\<^bsub>\\<^esub> {int n}" have ideal_I: "ideal ?I \" by (simp add: int.genideal_ideal) interpret i:ideal "?I" "\" using ideal_I by simp interpret s:ring_hom_ring "\" "ZFact (int n)" "(+>\<^bsub>\\<^esub>) ?I" using i.rcos_ring_hom_ring ZFact_def by auto show ?thesis by (simp add:zfact_iso_def ZFact_def) qed lemma zfact_prime_is_field: assumes "Factorial_Ring.prime (p :: nat)" shows "field (ZFact (int p))" using zfact_prime_is_finite_field[OF assms] finite_field_def by auto definition mod_ring :: "nat => nat ring" where "mod_ring n = \ carrier = {.. x y. (x * y) mod n), one = 1, zero = 0, add = (\ x y. (x + y) mod n) \" definition zfact_iso_inv :: "nat \ int set \ nat" where "zfact_iso_inv p = inv_into {.. 0" shows "zfact_iso_inv n \\<^bsub>ZFact (int n)\<^esub> = 0" unfolding zfact_iso_inv_def zfact_iso_0[OF n_ge_0, symmetric] using n_ge_0 by (rule inv_into_f_f[OF zfact_iso_inj], simp add:mod_ring_def) lemma zfact_coset: assumes n_ge_0: "n > 0" assumes "x \ carrier (ZFact (int n))" defines "I \ Idl\<^bsub>\\<^esub> {int n}" shows "x = I +>\<^bsub>\\<^esub> (int (zfact_iso_inv n x))" proof - have "x \ zfact_iso n ` {.. 1" shows "zfact_iso_inv n \ ring_iso (ZFact (int n)) (mod_ring n)" proof (rule ring_iso_memI) interpret r:cring "(ZFact (int n))" using ZFact_is_cring by simp define I where "I = Idl\<^bsub>\\<^esub> {int n}" have n_ge_0: "n > 0" using n_ge_1 by simp interpret i:ideal "I" "\" unfolding I_def using int.genideal_ideal by simp interpret s:ring_hom_ring "\" "ZFact (int n)" "(+>\<^bsub>\\<^esub>) I" using i.rcos_ring_hom_ring ZFact_def I_def by auto show "\x. x \ carrier (ZFact (int n)) \ zfact_iso_inv n x \ carrier (mod_ring n)" proof - fix x assume "x \ carrier (ZFact (int n))" hence "zfact_iso_inv n x \ {.. carrier (mod_ring n)" unfolding mod_ring_def by simp qed show "\x y. x \ carrier (ZFact (int n)) \ y \ carrier (ZFact (int n)) \ zfact_iso_inv n (x \\<^bsub>ZFact (int n)\<^esub> y) = zfact_iso_inv n x \\<^bsub>mod_ring n\<^esub> zfact_iso_inv n y" proof - fix x y assume x_carr: "x \ carrier (ZFact (int n))" define x' where "x' = zfact_iso_inv n x" assume y_carr: "y \ carrier (ZFact (int n))" define y' where "y' = zfact_iso_inv n y" have "x \\<^bsub>ZFact (int n)\<^esub> y = (I +>\<^bsub>\\<^esub> (int x')) \\<^bsub>ZFact (int n)\<^esub> (I +>\<^bsub>\\<^esub> (int y'))" unfolding x'_def y'_def using x_carr y_carr zfact_coset[OF n_ge_0] I_def by simp also have "... = (I +>\<^bsub>\\<^esub> (int x' * int y'))" by simp also have "... = (I +>\<^bsub>\\<^esub> (int ((x' * y') mod n)))" unfolding I_def zmod_int by (rule int_cosetI[OF n_ge_0],simp) also have "... = (I +>\<^bsub>\\<^esub> (x' \\<^bsub>mod_ring n\<^esub> y'))" unfolding mod_ring_def by simp also have "... = zfact_iso n (x' \\<^bsub>mod_ring n\<^esub> y')" unfolding zfact_iso_def I_def by simp finally have a:"x \\<^bsub>ZFact (int n)\<^esub> y = zfact_iso n (x' \\<^bsub>mod_ring n\<^esub> y')" by simp have b:"x' \\<^bsub>mod_ring n\<^esub> y' \ {..\<^bsub>mod_ring n\<^esub> y')) = x' \\<^bsub>mod_ring n\<^esub> y'" unfolding zfact_iso_inv_def by (rule inv_into_f_f[OF zfact_iso_inj[OF n_ge_0] b]) thus "zfact_iso_inv n (x \\<^bsub>ZFact (int n)\<^esub> y) = zfact_iso_inv n x \\<^bsub>mod_ring n\<^esub> zfact_iso_inv n y" using a x'_def y'_def by simp qed show "\x y. x \ carrier (ZFact (int n)) \ y \ carrier (ZFact (int n)) \ zfact_iso_inv n (x \\<^bsub>ZFact (int n)\<^esub> y) = zfact_iso_inv n x \\<^bsub>mod_ring n\<^esub> zfact_iso_inv n y" proof - fix x y assume x_carr: "x \ carrier (ZFact (int n))" define x' where "x' = zfact_iso_inv n x" assume y_carr: "y \ carrier (ZFact (int n))" define y' where "y' = zfact_iso_inv n y" have "x \\<^bsub>ZFact (int n)\<^esub> y = (I +>\<^bsub>\\<^esub> (int x')) \\<^bsub>ZFact (int n)\<^esub> (I +>\<^bsub>\\<^esub> (int y'))" unfolding x'_def y'_def using x_carr y_carr zfact_coset[OF n_ge_0] I_def by simp also have "... = (I +>\<^bsub>\\<^esub> (int x' + int y'))" by simp also have "... = (I +>\<^bsub>\\<^esub> (int ((x' + y') mod n)))" unfolding I_def zmod_int by (rule int_cosetI[OF n_ge_0],simp) also have "... = (I +>\<^bsub>\\<^esub> (x' \\<^bsub>mod_ring n\<^esub> y'))" unfolding mod_ring_def by simp also have "... = zfact_iso n (x' \\<^bsub>mod_ring n\<^esub> y')" unfolding zfact_iso_def I_def by simp finally have a:"x \\<^bsub>ZFact (int n)\<^esub> y = zfact_iso n (x' \\<^bsub>mod_ring n\<^esub> y')" by simp have b:"x' \\<^bsub>mod_ring n\<^esub> y' \ {..\<^bsub>mod_ring n\<^esub> y')) = x' \\<^bsub>mod_ring n\<^esub> y'" unfolding zfact_iso_inv_def by (rule inv_into_f_f[OF zfact_iso_inj[OF n_ge_0] b]) thus "zfact_iso_inv n (x \\<^bsub>ZFact (int n)\<^esub> y) = zfact_iso_inv n x \\<^bsub>mod_ring n\<^esub> zfact_iso_inv n y" using a x'_def y'_def by simp qed have "\\<^bsub>ZFact (int n)\<^esub> = zfact_iso n (\\<^bsub>mod_ring n\<^esub>)" by (simp add:zfact_iso_def ZFact_def I_def[symmetric] mod_ring_def) thus "zfact_iso_inv n \\<^bsub>ZFact (int n)\<^esub> = \\<^bsub>mod_ring n\<^esub>" unfolding zfact_iso_inv_def mod_ring_def using inv_into_f_f[OF zfact_iso_inj] n_ge_1 by simp show "bij_betw (zfact_iso_inv n) (carrier (ZFact (int n))) (carrier (mod_ring n))" using zfact_iso_inv_def mod_ring_def zfact_iso_bij[OF n_ge_0] bij_betw_inv_into by force qed lemma mod_ring_finite: "finite (carrier (mod_ring n))" by (simp add:mod_ring_def) lemma mod_ring_carr: "x \ carrier (mod_ring n) \ x < n" by (simp add:mod_ring_def) lemma mod_ring_is_cring: assumes n_ge_1: "n > 1" shows "cring (mod_ring n)" proof - have n_ge_0: "n > 0" using n_ge_1 by simp interpret cring "ZFact (int n)" using ZFact_is_cring by simp have "cring ((mod_ring n) \ zero := zfact_iso_inv n \\<^bsub>ZFact (int n)\<^esub> \)" by (rule ring_iso_imp_img_cring[OF zfact_iso_inv_is_ring_iso[OF n_ge_1]]) moreover have "(mod_ring n) \ zero := zfact_iso_inv n \\<^bsub>ZFact (int n)\<^esub> \ = mod_ring n" using zfact_iso_inv_0[OF n_ge_0] by (simp add:mod_ring_def) ultimately show ?thesis by simp qed lemma zfact_iso_is_ring_iso: assumes n_ge_1: "n > 1" shows "zfact_iso n \ ring_iso (mod_ring n) (ZFact (int n))" proof - have r:"ring (ZFact (int n))" using ZFact_is_cring cring.axioms(1) by blast interpret s: ring "(mod_ring n)" using mod_ring_is_cring cring.axioms(1) n_ge_1 by blast have n_ge_0: "n > 0" using n_ge_1 by linarith have "inv_into (carrier (ZFact (int n))) (zfact_iso_inv n) \ ring_iso (mod_ring n) (ZFact (int n))" using ring_iso_set_sym[OF r zfact_iso_inv_is_ring_iso[OF n_ge_1]] by simp moreover have "\x. x \ carrier (mod_ring n) \ inv_into (carrier (ZFact (int n))) (zfact_iso_inv n) x = zfact_iso n x" proof - fix x assume "x \ carrier (mod_ring n)" hence "x \ {..If @{term "p"} is a prime than @{term "mod_ring p"} is a field:\ lemma mod_ring_is_field: assumes"Factorial_Ring.prime p" shows "field (mod_ring p)" proof - have p_ge_0: "p > 0" using assms prime_gt_0_nat by blast have p_ge_1: "p > 1" using assms prime_gt_1_nat by blast interpret field "ZFact (int p)" using zfact_prime_is_field[OF assms] by simp have "field ((mod_ring p) \ zero := zfact_iso_inv p \\<^bsub>ZFact (int p)\<^esub> \)" by (rule ring_iso_imp_img_field[OF zfact_iso_inv_is_ring_iso[OF p_ge_1]]) moreover have "(mod_ring p) \ zero := zfact_iso_inv p \\<^bsub>ZFact (int p)\<^esub> \ = mod_ring p" using zfact_iso_inv_0[OF p_ge_0] by (simp add:mod_ring_def) ultimately show ?thesis by simp qed end diff --git a/thys/Universal_Hash_Families/Preliminary_Results.thy b/thys/Universal_Hash_Families/Universal_Hash_Families_More_Independent_Families.thy rename from thys/Universal_Hash_Families/Preliminary_Results.thy rename to thys/Universal_Hash_Families/Universal_Hash_Families_More_Independent_Families.thy --- a/thys/Universal_Hash_Families/Preliminary_Results.thy +++ b/thys/Universal_Hash_Families/Universal_Hash_Families_More_Independent_Families.thy @@ -1,348 +1,348 @@ section \Preliminary Results\ -theory Preliminary_Results +theory Universal_Hash_Families_More_Independent_Families imports - "Definitions" + Universal_Hash_Families "HOL-Probability.Stream_Space" "HOL-Probability.Probability_Mass_Function" begin lemma set_comp_image_cong: assumes "\x. P x \ f x = h (g x)" shows "{f x| x. P x} = h ` {g x| x. P x}" using assms by (auto simp: setcompr_eq_image) lemma (in prob_space) k_wise_indep_vars_compose: assumes "k_wise_indep_vars k M' X I" assumes "\i. i \ I \ Y i \ measurable (M' i) (N i)" shows "k_wise_indep_vars k N (\i x. Y i (X i x)) I" using indep_vars_compose2[where N="N" and X="X" and Y="Y" and M'="M'"] assms by (simp add: k_wise_indep_vars_def subsetD) text \The following two lemmas are of independent interest, they help infer independence of events and random variables on distributions. (Candidates for @{theory "HOL-Probability.Independent_Family"}).\ lemma (in prob_space) indep_sets_distr: fixes A assumes "random_variable N f" defines "F \ (\i. (\a. f -` a \ space M) ` A i)" assumes indep_F: "indep_sets F I" assumes sets_A: "\i. i \ I \ A i \ sets N" shows "prob_space.indep_sets (distr M N f) A I" proof (rule prob_space.indep_setsI) show "\A' J. J \ {} \ J \ I \ finite J \ \j\J. A' j \ A j \ measure (distr M N f) (\ (A' ` J)) = (\j\J. measure (distr M N f) (A' j))" proof - fix A' J assume a:"J \ I" "finite J" "J \ {}" "\j \ J. A' j \ A j" define F' where "F' = (\i. f -` A' i \ space M)" have "\ (F' ` J) = f -` (\ (A' ` J)) \ space M" unfolding set_eq_iff F'_def using a(3) by simp moreover have "\ (A' ` J) \ sets N" by (metis a sets_A sets.finite_INT subset_iff) ultimately have b: "measure (distr M N f) (\ (A' ` J)) = measure M (\ (F' ` J))" by (metis assms(1) measure_distr) have "\j. j \ J \ F' j \ F j" using a(4) F'_def F_def by blast hence c:"measure M (\ (F' ` J)) = (\j\ J. measure M (F' j))" by (metis indep_F indep_setsD a(1,2,3)) have "\j. j \ J \ F' j = f -` A' j \ space M" by (simp add:F'_def) moreover have "\j. j \ J \ A' j \ sets N" using a(1,4) sets_A by blast ultimately have d: "\j. j \ J \ measure M (F' j) = measure (distr M N f) (A' j)" using assms(1) measure_distr by metis show "measure (distr M N f) (\ (A' ` J)) = (\j\J. measure (distr M N f) (A' j))" using b c d by auto qed show "prob_space (distr M N f)" using prob_space_distr assms by blast show "\i. i \ I \ A i \ sets (distr M N f)" using sets_A sets_distr by blast qed lemma (in prob_space) indep_vars_distr: assumes "f \ measurable M N" assumes "\i. i \ I \ X' i \ measurable N (M' i)" assumes "indep_vars M' (\i. (X' i) \ f) I" shows "prob_space.indep_vars (distr M N f) M' X' I" proof - interpret D: prob_space "(distr M N f)" using prob_space_distr[OF assms(1)] by simp have a: "f \ space M \ space N" using assms(1) by (simp add:measurable_def) have "D.indep_sets (\i. {X' i -` A \ space N |A. A \ sets (M' i)}) I" proof (rule indep_sets_distr[OF assms(1)]) have "\i. i \ I \ {(X' i \ f) -` A \ space M |A. A \ sets (M' i)} = (\a. f -` a \ space M) ` {X' i -` A \ space N |A. A \ sets (M' i)}" by (rule set_comp_image_cong, simp add:set_eq_iff, use a in blast) thus "indep_sets (\i. (\a. f -` a \ space M) ` {X' i -` A \ space N |A. A \ sets (M' i)}) I" using assms(3) by (simp add:indep_vars_def2 cong:indep_sets_cong) next fix i assume "i \ I" thus "{X' i -` A \ space N |A. A \ sets (M' i)} \ sets N" using assms(2) measurable_sets by blast qed thus ?thesis using assms by (simp add:D.indep_vars_def2) qed lemma range_inter: "range ((\) F) = Pow F" unfolding image_def by auto text \The singletons and the empty set form an intersection stable generator of a countable discrete $\sigma$-algebra:\ lemma sigma_sets_singletons_and_empty: assumes "countable M" shows "sigma_sets M (insert {} ((\k. {k}) ` M)) = Pow M" proof - have "sigma_sets M ((\k. {k}) ` M) = Pow M" using assms sigma_sets_singletons by auto hence "Pow M \ sigma_sets M (insert {} ((\k. {k}) ` M))" by (metis sigma_sets_subseteq subset_insertI) moreover have "(insert {} ((\k. {k}) ` M)) \ Pow M" by blast hence "sigma_sets M (insert {} ((\k. {k}) ` M)) \ Pow M" by (meson sigma_algebra.sigma_sets_subset sigma_algebra_Pow) ultimately show ?thesis by force qed text \In some of the following theorems, the premise @{term "M = measure_pmf p"} is used. This allows stating theorems that hold for pmfs more concisely, for example, instead of @{term "measure_pmf.prob p A \ measure_pmf.prob p B"} we can just write @{term "M = measure_pmf p \ prob A \ prob B"} in the locale @{locale "prob_space"}.\ lemma prob_space_restrict_space: assumes [simp]:"M = measure_pmf p" shows "prob_space (restrict_space M (set_pmf p))" by (rule prob_spaceI, auto simp:emeasure_restrict_space emeasure_pmf) text \The abbreviation below is used to specify the discrete $\sigma$-algebra on @{term "UNIV"} as a measure space. It is used in places where the existing definitions, such as @{term "indep_vars"}, expect a measure space even though only a \emph{measurable} space is really needed, i.e., in cases where the property is invariant with respect to the actual measure.\ hide_const (open) discrete abbreviation "discrete \ count_space UNIV" lemma (in prob_space) indep_vars_restrict_space: assumes [simp]:"M = measure_pmf p" assumes "prob_space.indep_vars (restrict_space M (set_pmf p)) (\_. discrete) X I" shows "indep_vars (\_. discrete) X I" proof - have a: "id \ restrict_space M (set_pmf p) \\<^sub>M M" by (simp add:measurable_def range_inter sets_restrict_space) have "prob_space.indep_vars (distr (restrict_space M (set_pmf p)) M id) (\_. discrete) X I" using assms a prob_space_restrict_space by (auto intro!:prob_space.indep_vars_distr) moreover have "\A. emeasure (distr (restrict_space M (set_pmf p)) M id) A = emeasure M A" using emeasure_distr[OF a] by (auto simp add: emeasure_restrict_space emeasure_Int_set_pmf) hence "distr (restrict_space M p) M id = M" by (auto intro: measure_eqI) ultimately show ?thesis by simp qed lemma (in prob_space) measure_pmf_eq: assumes "M = measure_pmf p" assumes "\x. x \ set_pmf p \ (x \ P) = (x \ Q)" shows "prob P = prob Q" unfolding assms(1) by (rule measure_eq_AE, rule AE_pmfI[OF assms(2)], auto) text \The following lemma is an intro rule for the independence of random variables defined on pmfs. In that case it is possible, to check the independence of random variables point-wise. The proof relies on the fact that the support of a pmf is countable and the $\sigma$-algebra of such a set can be generated by singletons.\ lemma (in prob_space) indep_vars_pmf: assumes [simp]:"M = measure_pmf p" assumes "\a J. J \ I \ finite J \ prob {\. \i \ J. X i \ = a i} = (\i \ J. prob {\. X i \ = a i})" shows "indep_vars (\_. discrete) X I" proof - interpret R:prob_space "(restrict_space M (set_pmf p))" using prob_space_restrict_space by auto have events_eq_pow: "R.events = Pow (set_pmf p)" by (simp add:sets_restrict_space range_inter) define G where "G = (\i. {{}} \ (\x. {x}) ` (X i ` set_pmf p))" define F where "F = (\i. {X i -` a \ set_pmf p|a. a \ G i})" have sigma_sets_pow: "\i. i \ I \ sigma_sets (X i ` set_pmf p) (G i) = Pow (X i ` set_pmf p)" by (simp add:G_def, metis countable_image countable_set_pmf sigma_sets_singletons_and_empty) have F_in_events: "\i. i \ I \ F i \ Pow (set_pmf p)" unfolding F_def by blast have as_sigma_sets: "\i. i \ I \ {u. \A. u = X i -` A \ set_pmf p} = sigma_sets (set_pmf p) (F i)" proof - fix i assume a:"i \ I" have "\A. X i -` A \ set_pmf p = X i -` (A \ X i ` set_pmf p) \ set_pmf p" by auto hence "{u. \A. u = X i -` A \ set_pmf p} = {X i -` A \ set_pmf p |A. A \ X i ` set_pmf p}" by (metis (no_types, opaque_lifting) inf_le2) also have "... = {X i -` A \ set_pmf p |A. A \ sigma_sets (X i ` set_pmf p) (G i)}" using a by (simp add:sigma_sets_pow) also have "... = sigma_sets (set_pmf p) {X i -` a \ set_pmf p |a. a \ G i}" by (subst sigma_sets_vimage_commute[symmetric], auto) also have "... = sigma_sets (set_pmf p) (F i)" by (simp add:F_def) finally show "{u. \A. u = X i -` A \ set_pmf p} = sigma_sets (set_pmf p) (F i)" by simp qed have F_Int_stable: "\i. i \ I \ Int_stable (F i)" proof (rule Int_stableI) fix i a b assume "i \ I" "a \ F i" "b \ F i" thus "a \ b \ (F i)" unfolding F_def G_def by (cases "a \ b = {}", auto) qed have F_indep_sets:"R.indep_sets F I" proof (rule R.indep_setsI) fix i assume "i \ I" show "F i \ R.events" unfolding F_def events_eq_pow by blast next fix A fix J assume a:"J \ I" "J \ {}" "finite J" "\j\J. A j \ F j" have b: "\j. j \ J \ A j \ set_pmf p" by (metis PowD a(1,4) subsetD F_in_events) obtain x where x_def:"\j. j \ J \ A j = X j -` x j \ set_pmf p \ x j \ G j" using a by (simp add:Pi_def F_def, metis) show "R.prob (\ (A ` J)) = (\j\J. R.prob (A j))" proof (cases "\j \ J. A j = {}") case True hence "\ (A ` J) = {}" by blast then show ?thesis using a True by (simp, metis measure_empty) next case False then have "\j. j \ J \ x j \ {}" using x_def by auto hence "\j. j \ J \ x j \ (\x. {x}) ` X j ` set_pmf p" using x_def by (simp add:G_def) then obtain y where y_def: "\j. j \ J \ x j = {y j}" by (simp add:image_def, metis) have "\ (A ` J) \ set_pmf p" using b a(2) by blast hence "R.prob (\ (A ` J)) = prob (\ j \ J. A j)" by (simp add: measure_restrict_space) also have "... = prob ({\. \j \ J. X j \ = y j})" using a x_def y_def apply (simp add:vimage_def measure_Int_set_pmf) by (rule arg_cong2 [where f="measure"], auto) also have "... = (\ j\ J. prob (A j))" using x_def y_def a assms(2) by (simp add:vimage_def measure_Int_set_pmf) also have "... = (\j\J. R.prob (A j))" using b by (simp add: measure_restrict_space cong:prod.cong) finally show ?thesis by blast qed qed have "R.indep_sets (\i. sigma_sets (set_pmf p) (F i)) I" using R.indep_sets_sigma[simplified] F_Int_stable F_indep_sets by (auto simp:space_restrict_space) hence "R.indep_sets (\i. {u. \A. u = X i -` A \ set_pmf p}) I" by (simp add: as_sigma_sets cong:R.indep_sets_cong) hence "R.indep_vars (\_. discrete) X I" unfolding R.indep_vars_def2 by (simp add:measurable_def sets_restrict_space range_inter) thus ?thesis using indep_vars_restrict_space[OF assms(1)] by simp qed lemma (in prob_space) split_indep_events: assumes "M = measure_pmf p" assumes "indep_vars (\i. discrete) X' I" assumes "K \ I" "finite K" shows "prob {\. \x \ K. P x (X' x \)} = (\x \ K. prob {\. P x (X' x \)})" proof - have [simp]: "space M = UNIV" "events = UNIV" "prob UNIV = 1" by (simp add:assms(1))+ have "indep_vars (\_. discrete) X' K" using assms(2,3) indep_vars_subset by blast hence "indep_events (\x. {\ \ space M. P x (X' x \)}) K" using indep_eventsI_indep_vars by force hence a:"indep_events (\x. {\. P x (X' x \)}) K" by simp have "prob {\. \x \ K. P x (X' x \)} = prob (\x \ K. {\. P x (X' x \)})" by (simp add: measure_pmf_eq[OF assms(1)]) also have "... = (\ x \ K. prob {\. P x (X' x \)})" using a assms(4) by (cases "K = {}", auto simp: indep_events_def) finally show ?thesis by simp qed lemma pmf_of_set_eq_uniform: assumes "finite A" "A \ {}" shows "measure_pmf (pmf_of_set A) = uniform_measure discrete A" proof - have a:"real (card A) > 0" using assms by (simp add: card_gt_0_iff) have b: "\Y. emeasure (pmf_of_set A) Y = emeasure (uniform_measure discrete A) Y" using assms a by (simp add: emeasure_pmf_of_set divide_ennreal ennreal_of_nat_eq_real_of_nat) show ?thesis by (rule measure_eqI, auto simp add: b) qed lemma (in prob_space) uniform_onI: assumes "M = measure_pmf p" assumes "finite A" "A \ {}" assumes "\a. prob {\. X \ = a} = indicator A a / card A" shows "uniform_on X A" proof - have a:"\a. measure_pmf.prob p {x. X x = a} = indicator A a / card A" using assms(1,4) by simp have b:"map_pmf X p = pmf_of_set A" by (rule pmf_eqI, simp add:assms pmf_map vimage_def a) have "distr M discrete X = map_pmf X p" by (simp add: map_pmf_rep_eq assms(1)) also have "... = measure_pmf (pmf_of_set A)" using b by simp also have "... = uniform_measure discrete A" by (rule pmf_of_set_eq_uniform[OF assms(2,3)]) finally have "distr M discrete X = uniform_measure discrete A" by simp moreover have "random_variable discrete X" by (simp add: assms(1)) ultimately show ?thesis using assms(2,3) by (simp add: uniform_on_def) qed end