diff --git a/input/InitialPopulations/SimPaths - lag strucure - UK.xlsx b/input/InitialPopulations/SimPaths - lag strucure - UK.xlsx
new file mode 100644
index 000000000..0402f725d
Binary files /dev/null and b/input/InitialPopulations/SimPaths - lag strucure - UK.xlsx differ
diff --git a/input/InitialPopulations/compile/00_master.do b/input/InitialPopulations/compile/00_master.do
index 8717fdb0a..6078271c1 100644
--- a/input/InitialPopulations/compile/00_master.do
+++ b/input/InitialPopulations/compile/00_master.do
@@ -8,7 +8,7 @@
 * DATA:         	    UKHLS EUL version - UKDA-6614-stata [to wave n]
 *						WAS EUL version - UKDA-7215-stata [to wave 7]
 * AUTHORS: 				Daria Popova, Justin van de Ven
-* LAST UPDATE:          30 Apr 2025
+* LAST UPDATE:          1 July 2025 DP 
 ***************************************************************************************
 
 ***************************************************************************************
@@ -33,15 +33,14 @@ set matsize 1000
 
 /**************************************************************************************
 * DEFINE DIRECTORIES
-*************************************************************************************/
+**************************************************************************************/
 
 * Working directory
-*global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations"
-global dir_work "C:\Users\Patryk\Documents\SP_prep_pop"
+//global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations"
+global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations"
 
 * Directory which contains do files
-*global dir_do "${dir_work}/do"
-global dir_do "C:\Users\Patryk\git\SimPathsFork\input\InitialPopulations\compile"
+global dir_do "${dir_work}/do"
 
 * Directory which contains data files 
 global dir_data "${dir_work}/data"
@@ -50,14 +49,12 @@ global dir_data "${dir_work}/data"
 global dir_log "${dir_work}/log"
 
 * Directory which contains UKHLS data
-*global dir_ukhls_data "J:\01 DATA\UK\ukhls\wave14\stata\stata13_se\ukhls"
-//global dir_ukhls_data "D:\Dasha\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\ukhls"
-global dir_ukhls_data "C:\Users\Patryk\Documents\SP_prep_pop\ukhls\UKDA-6614-stata\stata\stata13_se\ukhls"
+//global dir_ukhls_data "J:\01 DATA\UK\ukhls\wave13\stata\stata13_se\ukhls"
+global dir_ukhls_data "D:\Dasha\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\ukhls"
 
 * Directory which contains WAS data
-*global dir_was_data "J:\01 DATA\UK\was\wave7\stata\stata13_se"
-//global dir_was_data "D:\Dasha\UK-original-data\WAS\UKDA-7215-stata\stata\stata13_se"
-global dir_was_data "C:\Users\Patryk\Documents\WAS\UKDA-7215-stata\stata\stata13_se"
+//global dir_was_data "J:\01 DATA\UK\was\wave7\stata\stata13_se"
+global dir_was_data "D:\Dasha\UK-original-data\WAS\UKDA-7215-stata\stata\stata13_se"
 
 * Directory which contains original initial popultions 
 global dir_ipop_orig "${dir_work}/original_initial_populations"
@@ -65,7 +62,7 @@ global dir_ipop_orig "${dir_work}/original_initial_populations"
 
 /**************************************************************************************
 * DEFINE OTHER GLOBAL VARIABLES
-*************************************************************************************/
+**************************************************************************************/
 * Define age to become responsible as defined in the simulation
 global age_become_responsible 18
 
@@ -100,7 +97,8 @@ global wealthEndYear = 2019
 
 /**************************************************************************************
 * ROUTE TO WORKER FILES 
-*************************************************************************************/
+**************************************************************************************/
+
 * Prepare simulated and observed data
 do "${dir_do}/01_prepare_UKHLS_pooled_data.do"
 * Process UKHLS data
@@ -112,12 +110,12 @@ do "${dir_do}/04_social_care_provided.do"
 do "${dir_do}/05_create_benefit_units.do"
 * reweight data and slice into yearly segments
 do "${dir_do}/06_reweight_and_slice.do"
-/* impute wealth data for selected years
+* impute wealth data for selected years
 do "${dir_do}/07_was_wealth_data.do"
 forvalues year = $wealthStartYear / $wealthEndYear {
 	global yearWealth = `year'
 	do "${dir_do}/08_wealth_to_ukhls.do"
-}*/
+} 
 do "${dir_do}/09_finalise_input_data.do"
 do "${dir_do}/10_check_yearly_data.do"
 
diff --git a/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do b/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do
index aa0f5efba..140282d3b 100644
--- a/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do
+++ b/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do
@@ -6,7 +6,7 @@
 * COUNTRY:              UK
 * DATA:         	    UKHLS EUL version - UKDA-6614-stata [to wave n]
 * AUTHORS: 				Daria Popova, Justin van de Ven
-* LAST UPDATE:          14 Jan 2025 DP
+* LAST UPDATE:          30 June 2025 DP 
 * NOTE:					Called from 00_master.do - see master file for further details
 ***************************************************************************************
 
@@ -21,7 +21,7 @@ log using "${dir_log}/01_prepare_UKHLS_pooled_data.log", replace
 
 /**************************************************************************************
 * Select and merge UKHLS data 
-*************************************************************************************/
+**************************************************************************************/
 
 *add variables from the all persons (Household grid) dataset
 foreach w of global UKHLSwaves {
@@ -31,7 +31,7 @@ foreach w of global UKHLSwaves {
 	
 	if (`waveno'<13) {
 		use pidp `w'_ivfho `w'_ivfio `w'_hhorig `w'_buno_dv `w'_dvage `w'_sex `w'_depchl `w'_hidp `w'_pno `w'_pns1pid `w'_pns2pid `w'_month `w'_intdaty_dv ///
-		`w'_mnspid `w'_fnspid `w'_ppid `w'_ppno `w'_sppid `w'_sex_dv `w'_mastat_dv `w'_gor_dv `w'_age_dv  /* `w'_hgbioad1 `w'_hgbioad2 */ ///
+		`w'_mnspid `w'_fnspid `w'_ppid `w'_ppno `w'_sppid `w'_sex_dv `w'_mastat_dv `w'_gor_dv `w'_age_dv   /* `w'_hgbioad1 `w'_hgbioad2 */ ///
 		`w'_intdatd_dv `w'_intdatm_dv `w'_intdaty_dv `w'_ethn_dv using `w'_indall.dta, clear
 	}
 	else {
@@ -62,33 +62,29 @@ foreach w of global UKHLSwaves {
 		`w'_fimnmisc_dv `w'_fimnprben_dv `w'_fimninvnet_dv `w'_fimnsben_dv `w'_fimnlabgrs_dv `w'_fimnpen_dv `w'_jbstat `w'_hiqual_dv `w'_jbhrs ///
 		`w'_j2hrs `w'_jshrs /*`w'_scsfl*/ `w'_scghq1_dv `w'_scghq2_dv `w'_jbsic07_cc `w'_bendis* `w'_scghq1_dv `w'_scghq2_dv ///
 		/*`w'_indinus_lw `w'_indscus_lw `w'_indpxub_xw `w'_indpxui_xw `w'_relup `w'_currpart* `w'_lmcbm* `w'_lmcby4* */ `w'_sf12mcs_dv `w'_sf12pcs_dv ///
-        `w'_sclfsato `w'_finnow ///
-		using `w'_indresp.dta, clear
+		`w'_bendis*	using `w'_indresp.dta, clear
 	}
 	else if (`waveno'<6) {
 		use pidp `w'_hidp `w'_pno `w'_buno_dv `w'_jbhrs `w'_jbot `w'_jshrs `w'_scghq1_dv `w'_scghq2_dv `w'_fimngrs_dv `w'_fimnnet_dv `w'_fimnlabnet_dv ///
 		`w'_fimnmisc_dv `w'_fimnprben_dv `w'_fimninvnet_dv `w'_fimnsben_dv `w'_fimnlabgrs_dv `w'_fimnpen_dv `w'_jbstat `w'_hiqual_dv `w'_jbhrs ///
 		`w'_j2hrs `w'_jshrs `w'_scsf1 `w'_scghq1_dv `w'_scghq2_dv `w'_jbsic07_cc `w'_bendis* `w'_scghq1_dv `w'_scghq2_dv ///
 		`w'_indinus_lw `w'_indscus_lw `w'_indpxub_xw /*`w'_indpxui_xw*/ `w'_relup `w'_currpart* `w'_lmcbm* `w'_lmcby4* `w'_sf12mcs_dv `w'_sf12pcs_dv ///
-        `w'_sclfsato `w'_finnow ///
-		using `w'_indresp.dta, clear
+		`w'_bendis*	using `w'_indresp.dta, clear
 	}
 	else if (`waveno'<13) {
 		use pidp `w'_hidp `w'_pno `w'_buno_dv `w'_jbhrs `w'_jbot `w'_jshrs `w'_scghq1_dv `w'_scghq2_dv `w'_fimngrs_dv `w'_fimnnet_dv `w'_fimnlabnet_dv ///
 		`w'_fimnmisc_dv `w'_fimnprben_dv `w'_fimninvnet_dv `w'_fimnsben_dv `w'_fimnlabgrs_dv `w'_fimnpen_dv `w'_jbstat `w'_hiqual_dv `w'_jbhrs ///
 		`w'_j2hrs `w'_jshrs `w'_scsf1 `w'_scghq1_dv `w'_scghq2_dv `w'_jbsic07_cc `w'_bendis* `w'_scghq1_dv `w'_scghq2_dv ///
 		`w'_indinus_lw `w'_indscus_lw /*`w'_indpxub_xw*/ `w'_indpxui_xw `w'_relup `w'_currpart* `w'_lmcbm* `w'_lmcby4* `w'_sf12mcs_dv `w'_sf12pcs_dv ///
-        `w'_sclfsato `w'_finnow ///
-		using `w'_indresp.dta, clear
+		`w'_bendis*	using `w'_indresp.dta, clear
 	} 
 
 	else if (`waveno'==13) {
 		use pidp `w'_hidp `w'_pno /*`w'_buno_dv*/ `w'_jbhrs `w'_jbot `w'_jshrs `w'_scghq1_dv `w'_scghq2_dv `w'_fimngrs_dv `w'_fimnnet_dv `w'_fimnlabnet_dv ///
 		`w'_fimnmisc_dv `w'_fimnprben_dv `w'_fimninvnet_dv `w'_fimnsben_dv `w'_fimnlabgrs_dv `w'_fimnpen_dv `w'_jbstat `w'_hiqual_dv `w'_jbhrs ///
 		/*`w'_j2hrs*/ `w'_jshrs `w'_scsf1 `w'_scghq1_dv `w'_scghq2_dv `w'_jbsic07_cc `w'_bendis* `w'_scghq1_dv `w'_scghq2_dv ///
-		`w'_indinus_lw `w'_indscus_lw /*`w'_indpxub_xw `w'_indpxui_xw*/ `w'_relup `w'_currpart* `w'_lmcbm* `w'_lmcby4* `w'_indpxui_xw `w'_sf12mcs_dv `w'_sf12pcs_dv ///
-        `w'_sclfsato `w'_finnow ///
-		using `w'_indresp.dta, clear 
+		`w'_indinus_lw `w'_indscus_lw /*`w'_indpxub_xw*/ `w'_indpxui_xw `w'_relup `w'_currpart* `w'_lmcbm* `w'_lmcby4* `w'_indpxui_xw `w'_sf12mcs_dv `w'_sf12pcs_dv ///
+		`w'_bendis*	using `w'_indresp.dta, clear 
 		gen m_j2hrs=-9 /*m_j2hrs not available in wave 13*/
 	}
 	
@@ -97,7 +93,7 @@ foreach w of global UKHLSwaves {
 		`w'_fimnmisc_dv `w'_fimnprben_dv `w'_fimninvnet_dv `w'_fimnsben_dv `w'_fimnlabgrs_dv `w'_fimnpen_dv `w'_jbstat `w'_hiqual_dv `w'_jbhrs ///
 		/*`w'_j2hrs*/ `w'_jshrs `w'_scsf1 `w'_scghq1_dv `w'_scghq2_dv `w'_jbsic07_cc `w'_bendis* `w'_scghq1_dv `w'_scghq2_dv ///
 		`w'_indinus_lw `w'_indscus_lw /*`w'_indpxub_xw `w'_indpxui_xw*/ `w'_indpxg2_xw `w'_relup `w'_currpart* `w'_lmcbm* `w'_lmcby4* `w'_sf12mcs_dv `w'_sf12pcs_dv ///
-		using `w'_indresp.dta, clear
+		`w'_bendis*	using `w'_indresp.dta, clear
 		gen m_j2hrs=-9 /*m_j2hrs not available in wave 14*/
 	} 
 	
@@ -117,16 +113,16 @@ foreach w of global UKHLSwaves {
 	local waveno=strpos("abcdefghijklmnopqrstuvwxyz","`w'")
 
 	if (`waveno'==1) {
-		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_fihhmnsben_dv `w'_nch02_dv /*`w'_hhdenub_xw `w'_hhdenui_xw*/ `w'_hsownd using `w'_hhresp.dta, clear
+		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv  `w'_nch02_dv /*`w'_hhdenub_xw `w'_hhdenui_xw*/ `w'_hsownd using `w'_hhresp.dta, clear
 	}
 	else if (`waveno'<6) {
-		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_fihhmnsben_dv `w'_nch02_dv `w'_hhdenub_xw /*`w'_hhdenui_xw*/ `w'_hsownd using `w'_hhresp.dta, clear
+		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_nch02_dv `w'_hhdenub_xw /*`w'_hhdenui_xw*/ `w'_hsownd using `w'_hhresp.dta, clear
 	}
 	else if (`waveno'<14) {
-		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_fihhmnsben_dv `w'_nch02_dv /*`w'_hhdenub_xw*/ `w'_hhdenui_xw `w'_hsownd using `w'_hhresp.dta, clear
+		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_nch02_dv /*`w'_hhdenub_xw*/ `w'_hhdenui_xw `w'_hsownd using `w'_hhresp.dta, clear
 	} 
 	else if (`waveno'==14) {
-		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_fihhmnsben_dv `w'_nch02_dv /*`w'_hhdenub_xw `w'_hhdenui_xw*/ `w'_hhdeng2_xw `w'_hsownd using `w'_hhresp.dta, clear
+		use `w'_hidp `w'_fihhmnnet1_dv `w'_fihhmngrs1_dv `w'_nch02_dv /*`w'_hhdenub_xw `w'_hhdenui_xw*/ `w'_hhdeng2_xw `w'_hsownd using `w'_hhresp.dta, clear
 	}
 	
 	gen swv = `waveno'
@@ -140,7 +136,7 @@ foreach w of global UKHLSwaves {
 
 /**************************************************************************************
 * Prepare and merge income variables:
-*************************************************************************************/
+**************************************************************************************/
 foreach w of global UKHLSwaves {
 
 	// find the wave number
@@ -167,29 +163,31 @@ gen inc_tu = frmnthimp_dv if ficode == 25 //Trade Union / Friendly Society Payme
 gen inc_ma = frmnthimp_dv if ficode == 26 //Maintenance or Alimony
 gen inc_fm = frmnthimp_dv if ficode == 27 //payments from a family member not living here
 gen inc_oth = frmnthimp_dv if ficode == 38 //any other regular payment (not asked in Wave 1)
-keep swv pidp hidp inc_pp inc_tu inc_ma inc_fm inc_oth
-drop if missing(inc_pp) & missing(inc_tu) & missing(inc_ma) & missing(inc_fm) & missing(inc_oth)
-collapse (sum) inc_pp inc_tu inc_ma inc_fm inc_oth, by(swv pidp hidp)
+/*			          
+8  Severe Disablement Allowance	
+9  Industrial Injury Disablement Allowance	
+10 Disability Living Allowance	
+11 Attendance Allowance	
+12 Carer's Allowance (formerly Invalid Care	Allowance)				          
+13 War Disablement Pension	
+14 Incapacity Benefit	
+33 Employment and Support Allowance                  
+34 Return to Work Credit                                  
+35 Sickness and Accident Insurance                      
+37 Other Disability Related Benefit or Payment          
+41 Personal Independence Payments                       
+43 Child Disability Payment                               
+44 Adult Disability Payment                              
+45 Pension Age Disability Payment                         
+*/
+gen inc_disab = frmnthimp_dv if inrange(ficode, 8, 14) | inlist(ficode, 33, 34, 35, 37, 41, 43, 44, 45) //disability-related income codes listed above
+
+keep swv pidp hidp inc_pp inc_tu inc_ma inc_fm inc_oth inc_disab
+drop if missing(inc_pp) & missing(inc_tu) & missing(inc_ma) & missing(inc_fm) & missing(inc_oth) & missing(inc_disab)
+collapse (sum) inc_pp inc_tu inc_ma inc_fm inc_oth inc_disab, by(swv pidp hidp)
 save "$dir_data\tmp_income", replace
 restore
 
-/******************************Benefits receipt *****************************/
-
-preserve
-* Generate UC benefit marker
-gen benefits_uc=(ficode==40)
-label var benefits_uc "Universal Credit indicator"
-
-
-keep hidp pidp swv benefits_uc
-collapse (max) benefits_uc, by(hidp swv)
-compress
-
-save "$dir_data/tmp_ucrcpt", replace
-restore
-
-
-
 //merge variables from the youth dataset 9-18 years old * 
 foreach w of global UKHLSwaves {
 
@@ -208,34 +206,13 @@ foreach w of global UKHLSwaves {
 	}
 }
 
-//merge variables from the youth dataset 9-18 years old * 
-
-foreach w of global UKHLSwaves {
-
-	// find the wave number
-	local waveno=strpos("abcdefghijklmnopqrstuvwxyz","`w'")
-
-	if (`waveno'>7 | mod(`waveno',2)==0) {
-		use pidp `w'_hidp `w'_ypsrhlth  using  `w'_youth.dta, clear
-		
-		gen swv = `waveno'
-		rename `w'_* *
-		if (`waveno'>2) {
-			append using "$dir_data\add_vars_ukhls_youth.dta"
-		}
-		save "$dir_data\add_vars_ukhls_youth.dta", replace
-	}
-}
-
-
 
 /**************************************************************************************
 * merge all datasets together 
-*************************************************************************************/
+**************************************************************************************/
 use "$dir_data\add_vars_ukhls.dta", clear
 merge 1:1 pidp hidp swv using "$dir_data\add_vars_ukhls_indresp.dta", keep(1 3) nogen
 merge m:1 hidp swv using "$dir_data\add_vars_ukhls_hhresp.dta", keep(1 3) nogen
-merge m:1 hidp swv using "$dir_data\tmp_ucrcpt", keep(1 3) nogen
 merge 1:1 pidp hidp swv using "$dir_data\tmp_income", keep(1 3) nogen
 merge 1:1 pidp hidp swv using "$dir_data\add_vars_ukhls_youth.dta", keep(1 3) nogen
 
@@ -252,14 +229,14 @@ replace month = 1 if month == -10 // month not available for IEMB (Ethnic Minori
 
 /**************************************************************************************
 * save output
-*************************************************************************************/
+**************************************************************************************/
 save "$dir_data\ukhls_pooled_all_obs_01.dta", replace
 cap log close 
 
 
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	add_vars_ukhls.dta
@@ -276,5 +253,3 @@ foreach file of local files_to_drop {
 }
 
 
-
-
diff --git a/input/InitialPopulations/compile/02_create_UKHLS_variables.do b/input/InitialPopulations/compile/02_create_UKHLS_variables.do
index b066757ce..e7e61a9db 100644
--- a/input/InitialPopulations/compile/02_create_UKHLS_variables.do
+++ b/input/InitialPopulations/compile/02_create_UKHLS_variables.do
@@ -6,7 +6,7 @@
 * COUNTRY:              UK
 * DATA:         	    UKHLS EUL version - UKDA-6614-stata [to wave n]
 * AUTHORS: 				Daria Popova, Justin van de Ven
-* LAST UPDATE:          14 Jan 2025 DP
+* LAST UPDATE:          30 June 2025 DP 
 * NOTE:					Called from 00_master.do - see master file for further details
 *						Use -9 for missing values 
 ***************************************************************************************
@@ -23,7 +23,7 @@ lab define dummy 1 "yes" 0 "no"
 
 /**************************************************************************************
 * SAMPLE
-*************************************************************************************/
+**************************************************************************************/
 
 ***Drop IEMB: 
 fre hhorig
@@ -72,7 +72,7 @@ fre ivfio
 
 /**************************************************************************************
 * CREATE REQUIRED VARIABLES
-*************************************************************************************/
+**************************************************************************************/
 
 
 /*****************************SYSTEM VARIABLES*********************************/
@@ -89,17 +89,37 @@ la var stm "Interview year"
 gen Int_Date = mdy(intdatm_dv, intdatd_dv ,intdaty_dv) 
 format Int_Date %d
 
-
 /**************************** HOUSEHOLD IDENTIFIER*****************************/
 clonevar idhh= hidp 
 la var idhh "Household identifier"
 
 
-/********************************* INDIVIDUALS ID*****************************/ 
+/********************************* INDIVIDUALS ID******************************/ 
 clonevar idperson=pidp 
 la var idperson "Unique cross wave identifier"
 
 
+/********************************* EUROMOD style interview date*****************/  
+// If missing (-9), replace with the household maximum (day, month and year are each taken separately)
+bys swv idhh: egen hhintdatd = max(intdatd_dv)
+bys swv idhh: egen hhintdatm = max(intdatm_dv)
+bys swv idhh: egen hhintdaty = max(intdaty_dv)
+
+replace intdatd_dv = hhintdatd if intdatd_dv == -9
+replace intdatm_dv = hhintdatm if intdatm_dv == -9
+replace intdaty_dv = hhintdaty if intdaty_dv == -9
+
+replace intdaty_dv = stm if intdaty_dv == -9
+replace intdatm_dv = 6 if intdatm_dv == -9
+replace intdatd_dv = 15 if intdatd_dv == -9
+
+gen double ddt=intdatd_dv+100*intdatm_dv+10000*intdaty_dv
+
+format ddt %15.0g
+la var ddt "date of interview"
+fre ddt
+
+
 *******************************************************************************
 xtset idperson swv //Set panel
 
@@ -125,12 +145,12 @@ clonevar idpartner=ppid
 la var idpartner "Unique cross wave identifier of partner"
 
 
-/**********************ID FATHER (includes natural/step/adoptive)*************/
+/**********************ID FATHER (includes natural/step/adoptive)**************/
 clonevar idfather= fnspid
 la var idfather "Father unique identifier"
 
 
-/************************ID MOTHER (includes natural/step/adoptive)***********/
+/************************ID MOTHER (includes natural/step/adoptive)************/
 clonevar idmother=mnspid 
 la var idmother "Mother unique identifier"
 
@@ -164,7 +184,7 @@ lab val dun dummy
 //fre dun 
 
 
-/************************* region (NUTS 1) ***********************************/ 
+/************************* region (NUTS 1) ************************************/ 
 //fre gor_dv
 gen drgn1=-9
 replace drgn1=1 if gor_dv==1 
@@ -197,12 +217,12 @@ lab define drgn1 ///
 lab values drgn1 drgn1
 
 
-/***********************country***********************************************/
+/***********************country************************************************/
 gen dct=15
 la var dct "Country code: UK"
 
 
-/**********************Partner's gender***************************************/
+/**********************Partner's gender****************************************/
 duplicates report idpartner swv if idpartner >0
 /*
 Duplicates in terms of idpartner swv
@@ -358,7 +378,6 @@ gen dhm_flag = missing(dhm)
 replace dhm = round(dhm_prediction) if missing(dhm) 
 bys dhm_flag : sum dhm 
 
-
 /**************************Subjective wellbeing (GHQ): Caseness ******************************
 0: not psychologically distressed, scghq2_dv < 4 
 1: psychologically distressed, scghq2_dv >= 4
@@ -430,29 +449,29 @@ gen dhe_pcs_flag = missing(dhe_pcs)
 replace dhe_pcs = round(dhe_pcs_prediction) if missing(dhe_pcs) 
 bys dhe_pcs_flag : sum dhe_pcs
 
-/***************************** Life Satisfaction ***************************************************************************/
-/* Life satisfaction, self report. Continuous scale 0 to 7. */
-
-
-gen dls = sclfsato
-replace dls = . if sclfsato < 0
-lab var dls "DEMOGRAPHIC: Life Satisfaction"
-// fre dls if dag>0 & dag<16
 
+/************Partner's self-rated health - mental and physical component***************/
 preserve
-drop if dgn < 0 | dag<0 | dhe<0
-eststo predict_dls: reg dls c.dag i.dgn i.swv i.dhe c.dhm c.dhe_mcs, vce(robust) // Physical and mental health have a big impact, so included as covariate.  
+keep swv idperson dhe_mcs dhe_pcs
+rename idperson idpartner 
+rename dhe_mcs dhe_mcssp 
+rename dhe_pcs dhe_pcssp
+
+save "$dir_data/temp_dhe", replace
 restore
-estimates restore predict_dls
-predict dls_prediction
-// fre dls_prediction
 
-gen dls_flag = missing(dls)
-replace dls = round(dls_prediction) if missing(dls) 
-bys dls_flag : sum dls 
+merge m:1 swv idpartner using "$dir_data/temp_dhe"
+la var dhe_mcssp "Partner's Self-rated health health - mental component"
+la var dhe_pcssp "Partner's Self-rated health health - physical component"
+keep if _merge == 1 | _merge == 3
+drop _merge
+replace dhe_mcssp=-9 if missing(dhe_mcssp) & idpartner>0
+replace dhe_pcssp=-9 if missing(dhe_pcssp) & idpartner>0
+//fre dhe_mcssp dhe_pcssp if idpartner>0
 
 
 /****************************Ehtnicity*****************************************/
+fre ethn_dv
 /*Ethnic group derived from multiple sources such as self-reported as an adult, self-reported as a youth, reported by a household member, and ethnic group of biological parents.
 ethn_dv	-- Ethnic group (derived from multiple sources)
 	-9 missing	
@@ -460,36 +479,52 @@ ethn_dv	-- Ethnic group (derived from multiple sources)
 	2  irish
 	3  gypsy or irish traveller	
 	4  any other white background
+	
 	5  white and black caribbean	
 	6  white and black african	
 	7  white and asian	
 	8  any other mixed background	
+	
 	9  indian		
 	10 pakistani	
 	11 bangladeshi	
 	12 chinese	
 	13 any other asian background	
+	
 	14 caribbean	
 	15 african	
 	16 any other black background	
+	
 	17 arab	
 	97 any other ethnic group  	  
-*/		
-*Note: Missing ethnic group is combined with "Other" 	
+*/	
+
+* definition used in regression estimates 
 cap gen dot = . 
-replace dot = 1 if ethn_dv>=1 & ethn_dv <=4 //white//
-replace dot = 2 if ethn_dv>=5 & ethn_dv<=8 //mixed //
-replace dot = 3 if ethn_dv>=9 & ethn_dv<=13 //asian//
-replace dot = 4 if ethn_dv>=14 & ethn_dv<=16 //black//
-replace dot = 5 if ethn_dv==17 | ethn_dv==97 //other, arab//  
-replace dot = 5 if ethn_dv==-9 //missing// 
-lab var dot "DEMOGRAPHIC: Ethnicity"
-cap label define dot -9 "missing" 1 "White" 2 "Mixed or Multiple ethnic groups" 3 "Asian or Asian British" 4 "Black, Black British, Caribbean, or African" 5 "Other or missing ethnic group"
+replace dot = 1 if inrange(ethn_dv, 1, 7)   //white and mixed with white//
+replace dot = 2 if inrange(ethn_dv, 9, 13)  //asian//
+replace dot = 3 if inrange(ethn_dv, 14, 16) //black//
+replace dot = 4 if inlist(ethn_dv, 8, 17, 97, -9) //any other mixed background, arab, any other ethnic group, missing
+lab var dot "Ethnicity"
+cap label define dot 1 "White" 2 "Asian or Asian British" 3 "Black, Black British, Caribbean, or African" 4 "Other or missing ethnic group"
 label values dot dot 
-//fre dot 
+fre dot 
 
-
-/******************************Education status*******************************/
+	
+*ONS style definition (but missing is kept as a separate category)  	
+cap gen dot01 = . 
+replace dot01 = 1 if inrange(ethn_dv, 1, 4)   //white//
+replace dot01 = 2 if inrange(ethn_dv, 5, 8)   //mixed//
+replace dot01 = 3 if inrange(ethn_dv, 9, 13)  //asian//
+replace dot01 = 4 if inrange(ethn_dv, 14, 16) //black//
+replace dot01 = 5 if inlist(ethn_dv, 17, 97)  //arab, any other ethnic group//
+replace dot01 = 6 if ethn_dv == -9            //missing kept as its own category//
+lab var dot01 "Ethnicity"
+cap label define dot01  1 "White" 2 "Mixed or Multiple ethnic groups" 3 "Asian or Asian British" 4 "Black, Black British, Caribbean, or African" 5 "Other ethnic group" 6 "Missing"
+label values dot01 dot01 
+fre dot01 
+
+/******************************Education status********************************/
 *Use hiqual variable, code negative values to missing
 *Low education: Other qualification, no qualification
 *Medium education: Other higher degree, A-level etc, GCSE etc
@@ -675,7 +710,7 @@ replace dcpst = 2 if dag <= 17 & idpartner<0
 //fre dcpst
 
 
-/*****************************Enter partnership*******************************/
+/*****************************Enter partnership********************************/
 sort idperson swv 
 cap drop dcpen
 gen dcpen = -9
@@ -697,13 +732,13 @@ la var dcpex "Exit partnership"
 //fre dcpex
 
 
-/*****************************Age difference partners*************************/
+/*****************************Age difference partners**************************/
 gen dcpagdf = dag - dagsp if (dag > 0 & dagsp > 0) //Leave with negative values? Or should be absolute?
 la var dcpagdf "Partner's age difference"
 
 
-/*********************************Activity status*****************************/
-recode jbstat (1 2 5 12 13 14 = 1 "Employed or self-employed") ///
+/*********************************Activity status******************************/
+recode jbstat (1 2 5 12 13 14 15 = 1 "Employed or self-employed") ///
 	(7 = 2 "Student") ///
 	(3 6 8 10 11 97 9 4 = 3 "Not employed") /// /*includes apprenticeships, unpaid family business, govt training scheme+retired */
 	, into(les_c3)
@@ -726,7 +761,7 @@ lab val les_c4 les_c4
 //tab2 les_c3 les_c4
 
 
-/****************************Partner's activity status:***********************/
+/****************************Partner's activity status:************************/
 preserve
 keep swv idperson idhh les_c3
 rename les_c3 lessp_c3
@@ -755,7 +790,7 @@ drop _merge
 //fre lessp_c4
 
 
-/***********************Own and Spousal Activity Status***********************/
+/***********************Own and Spousal Activity Status************************/
 gen lesdf_c4 = -9
 replace lesdf_c4 = 1 if les_c3 == 1 & lessp_c3 == 1 & dcpst == 1 //Both employed
 replace lesdf_c4 = 2 if les_c3 == 1 & (lessp_c3 == 2 | lessp_c3 == 3) & dcpst == 1 //Employed, spouse not employed
@@ -769,7 +804,7 @@ la var lesdf_c4 "Own and spousal activity status"
 //fre lesdf_c4
 
 
-/******************************Civil servant status***************************/
+/******************************Civil servant status****************************/
 gen lcs=0
 // R.K. (11.05.2017) (we can use SIC 2007 condensed version- this is what Paola does for FRS EUROMOD)
 replace lcs=1 if jbsic07_cc==84
@@ -778,7 +813,7 @@ lab val lcs dummy
 //fre lcs 
 
 
-/***********************************Hours of work*****************************/
+/***********************************Hours of work******************************/
 recode jbhrs (-9/-1 . = .) //is it fine to recode these to 0? don't want to have missing in simulation?
 recode jbot (-9/-1 . = .)
 recode jshrs (-9/-1 . = .)
@@ -788,15 +823,6 @@ replace lhw = ceil(lhw)
 la var lhw "Hours worked per week"
 //fre lhw 
 
-// Lag(1) of hours of work
-xtset // check if xtset correct 
-gen l1_lhw = l1.lhw
-
-replace l1_lhw = lhw if l1.les_c4 == 1 & les_c4 == 1 & missing(l1_lhw) // replace lagged value with current value if employed last period and this period
-replace l1_lhw = lhw if les_c4 == 1 & missing(l1_lhw) // replace lagged value with current value if above not successful 
-replace l1_lhw = 0 if l1.les_c4 != 1 // replace lagged value with zero if not compatible with lagged employment state
-replace l1_lhw = 0 if les_c4 != 1 & missing(l1_lhw) // replace with zero if not working and l1_lhw still missing
-
 
 /*****************************Number of children*******************************/
 //Number of children aged 0-2 (Checked against manually generating count of children 0-2 per HH - same numbers, but nch02_dv distinguishes missing and 0)
@@ -815,7 +841,7 @@ bys swv idhh: egen dnc = sum(depChild)
 la var dnc "Number of dependent children 0 - 18"
 
 
-/*******************************Flag for adult children***********************/
+/*******************************Flag for adult children************************/
 preserve
 keep if dgn == 0
 keep swv idhh idperson dag
@@ -858,31 +884,70 @@ la var dhhtp_c4 "Household composition"
 //fre dhhtp_c4
 
 
-/************************Long-term sick or disabled***************************/
+/************************Long-term sick or disabled****************************/
 gen dlltsd = 0
 replace dlltsd = 1 if jbstat == 8
 sort idperson swv 
 replace dlltsd = 1 if missing(jbstat) & l.jbstat == 8
 //replace dlltsd = 1 if missing(jbstat) & missing(l.jbstat) & l2.jbstat == 8
-la var dlltsd "DEMOGRAPHIC: LT sick or disabled"
+la var dlltsd "DEMOGRAPHIC: LT sick/disabled"
+//fre dlltsd
+
+//check if in receipt of disability benefits 
+/*
+fre bendis1 //Income: Disability benefits: Incapacity Benefit
+fre bendis2 //Income: Disability benefits: Employment and Support Allowance
+fre bendis3 //Income: Disability benefits: Severe Disablement Allowance
+fre bendis4 //Income: Disability benefits: Carer's Allowance
+fre bendis5 //Income: Disability benefits: Disability Living Allowance
+fre bendis6 //Income: Disability benefits: Return to work credit
+fre bendis7 //Income: Disability benefits: Attendance Allowance
+fre bendis8 //Income: Disability benefits: Industrial Injury Disablement Benefit
+fre bendis9 //Income: Disability benefits: War disablement pension
+fre bendis10 //Income: Disability benefits: Sickness and Accident Insurance
+fre bendis11 //Income: Disability benefits: Universal Credit
+fre bendis12 //Income: Disability benefits: Personal Independence Payments
+fre bendis13 //Income: Disability benefits: Child Disability Payment
+fre bendis14 //Income: Disability benefits: Adult Disability Payment
+fre bendis15 //Income: Disability benefits: Pension Age Disability Payment
+fre bendis97 //Income: Disability benefits: Any other disability related benefit or payment
+*/
+//disben = 1 if any of the listed disability benefit indicators equals 1, 0 otherwise
+gen disben = inlist(1, bendis1, bendis2, bendis3, bendis4, bendis5, bendis6, bendis7, bendis8, bendis9, ///
+                    bendis10, bendis12, bendis13, bendis14, bendis15)
+/*Note: exclude bendis11 (Universal credit) as it can be jointly received  and bendis97 (any other) 
+bysort swv idhh (idhh): gen hhsize = _N
+tab2 hhsize disben
+tab2 dlltsd disben */
+
+//second check: disability income based on ficode (disability income is computed in 01_prepare_ukhls_pooled_data)
+gen disben2 = (inc_disab>0 & inc_disab<.) 
+
+//flag those who report being LT sick/disabled OR who receive disability benefits according to both checks
+gen dlltsd01 = (dlltsd==1 | (disben==1 & disben2==1)) 
+la var dlltsd01 "DEMOGRAPHIC: LT sick/disabled or receives disability benefits"
+//fre dlltsd01
+//tab2 dlltsd01 dlltsd
 
 
 /*******************Long-term sick or disabled - spouse ***********************/
 preserve
-keep swv idperson dlltsd
+keep swv idperson dlltsd dlltsd01
 rename idperson idpartner
 rename dlltsd dlltsd_sp
+rename dlltsd01 dlltsd01_sp
 save "$dir_data/temp_dlltsd", replace
 restore
 
 merge m:1 swv idpartner using "$dir_data/temp_dlltsd"
-la var dlltsd_sp "Partner's long-term sick"
+la var dlltsd_sp "Partner's long-term sick/disabled"
+la var dlltsd01_sp "Partner's long-term sick/disabled or receives disability benefits"
 keep if _merge == 1 | _merge == 3
 drop _merge
 //fre dlltsd_sp
 
 
-/*******************************Retired***************************************/
+/*******************************Retired****************************************/
 gen dlrtrd = 0
 replace dlrtrd = 1 if jbstat == 4
 sort idperson swv 
@@ -952,7 +1017,7 @@ replace dagpns = 1 if dgn==0 & dag>=63 & stm>=2016 & stm<2018
 replace dagpns = 1 if dgn==0 & dag>=64 & stm>=2018 & stm<2019
 replace dagpns = 1 if dgn==0 & dag>=65 & stm>=2019 & stm<2021
 replace dagpns = 1 if dgn==0 & dag>=66 & stm>=2021 
-
+lab var dagpns "Reached state retirement age"
 
 /****************************Pension age of a spouse***************************/
 preserve
@@ -963,7 +1028,7 @@ save "$dir_data/temp_dagpns", replace
 restore
 merge m:1 swv idpartner idhh using "$dir_data/temp_dagpns"
 keep if _merge == 1 | _merge == 3
-la var dagpns_sp "Pension age - partner"
+la var dagpns_sp "Reached state retirement age - partner"
 drop _merge
 replace dagpns_sp=-9 if idpartner<0
 
@@ -977,7 +1042,7 @@ lab define lesnr_c2 1 "in work" 2 "not in work"
 lab val lesnr_c2 lesnr_c2 
 
 
-/************************Exited parental home*********************************/
+/************************Exited parental home**********************************/
 /*Generated from fnspid and/or mnspid. 1 means that individual no longer lives with a parent (fnspid & mnspid is equal to missing)
  when in the previous wave they lived with a parent  (fnspid or mnspid not equal to missing).*/
 /*
@@ -1005,14 +1070,14 @@ la val sedex dummy
 la var sedex "Left education"
 
 
-/****************************Same-sex partnership*****************************/
+/****************************Same-sex partnership******************************/
 gen ssscp = 0 if idpartner>0
 replace ssscp = 1 if idpartner>0 & (dgn == dgnsp) & dgn>=0 & dgn<. & dgnsp>=0 & dgnsp<.
 la val ssscp dummy
 la var ssscp "Same-sex partnership"
 //fre ssscp
 
-/****************************Year prior to exiting partnership*****************/
+/****************************Year prior to exiting partnership******************/
 cap gen scpexpy = 0
 replace scpexpy = 1 if f.dcpex==1 
 replace scpexpy=-9 if swv==14 //Impossible to know for the most recent wave 
@@ -1028,7 +1093,7 @@ lab val sprfm dummy
 la var sprfm "Woman in fertility range dummy (18- 44)"
 
 
-/************************UK General Fertility Rate: From ONS 2019*************/
+/************************UK General Fertility Rate: From ONS 2019**************/
 /*Source: https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/livebirths/datasets/birthsummarytables
 for 2023: https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/livebirths/datasets/birthsinenglandandwalesbirthregistrations
 */
@@ -1087,7 +1152,7 @@ la val sedag dummy
 la var sedag "Educ age range"
 
 
-/****************************Partnership duration*****************************/
+/****************************Partnership duration******************************/
 *idpartner in wave a 
 clonevar idpartner_a=a_ppid
 la var idpartner_a "Unique cross wave identifier of partner in wave a"
@@ -1151,7 +1216,7 @@ la var dcpyy "Years in partnership"
 //by swv: fre dcpyy
 
 
-/**************************OECD equivalence scale*****************************/
+/**************************OECD equivalence scale******************************/
 //Temporary number of children 0-13 and 14-18 to create household OECD equivalence scale
 gen depChild_013 = 1 if (dag >= 0 & dag <= 13) & (pns1pid > 0 | pns2pid > 0) & (depchl_dv == 1)
 gen depChild_1418 = 1 if (dag >= 14 & dag <= 18) & (pns1pid > 0 | pns2pid > 0) & (depchl_dv == 1)
@@ -1188,7 +1253,6 @@ Note: This is supposed to mirror UKMOD market income
    gen inc_tu = frmnthimp_dv if ficode == 25 (Trade Union / Friendly Society Payment)
    gen inc_ma = frmnthimp_dv if ficode == 26 (Maintenance or Alimony)
 Instead of (3) and (4) , use :  
-
 gen inc_pp = frmnthimp_dv if ficode == 4 //A Private Pension/Annuity
 gen inc_tu = frmnthimp_dv if ficode == 25 //Trade Union / Friendly Society Payment
 gen inc_ma = frmnthimp_dv if ficode == 26 //Maintenance or Alimony
@@ -1329,9 +1393,9 @@ gen inc_fm = frmnthimp_dv if ficode == 27 //payments from a family member not li
 gen inc_oth = frmnthimp_dv if ficode == 38 //any other regular payment (not asked in Wave 1)
 */
 recode fimninvnet_dv fimnprben_dv inc_fm inc_oth (-1=.) (-9=.)
-egen ypncp_temp = rowtotal (fimninvnet_dv inc_fm inc_oth fimnprben_dv) 
-assert ypncp_temp>=0
-cap gen ypncp = asinh( ypncp_temp*gross_net_ratio*(1/CPI) ) 
+egen ypncp_lvl = rowtotal (fimninvnet_dv inc_fm inc_oth fimnprben_dv) 
+assert ypncp_lvl>=0
+cap gen ypncp = asinh( ypncp_lvl*gross_net_ratio*(1/CPI) ) //IHS - inverse hyperbolic sine transformation
 lab var ypncp "Gross personal non-employment capital income"
 //sum ypncp
 
@@ -1347,7 +1411,7 @@ lab var  ypnoab "Gross personal private pension income"
 //sum ypnoab
 
 
-/******************************Home ownership dummy***************************/
+/******************************Home ownership dummy****************************/
 *Dhh_owned is the definition used in the initial population and in the model predicting house ownership 
 *in the homeownership process of the simulation.
 bys swv: fre hsownd
@@ -1356,7 +1420,7 @@ replace dhh_owned=1 if hsownd>=1 & hsownd<=3
 lab var dhh_owned "Home ownership dummy"
 
 
-/******************************Disability benefit*****************************/
+/******************************Disability benefit******************************/
 /*If any of bendis1-bendis3, bendis5-bendis12, or bendis97 = 1 then received benefits
 
 Availability of vars across waves: 
@@ -1383,55 +1447,10 @@ la val bdi dummy
 la var bdi "Disability benefits"
 
 
-/******************************Unemployment dummy***************************/
+/******************************Unemployment dummy****************************/
 gen unemp = (jbstat==3)
 label variable unemp "Labour status: unemployed"
 
-/***************************** UC and Non-UC receipt ***********************/
-
-gen econ_benefits = .
-replace econ_benefits = 1 if fihhmnsben_dv > 0 & fihhmnsben_dv!=.
-replace econ_benefits = 0 if fihhmnsben_dv==0
-label var econ_benefits "Household income includes any benefits"
-
-replace benefits_uc=0 if benefits_uc==.
-* Ensure all with known UC receipt also are benefit recipients
-replace econ_benefits=1 if benefits_uc==1
-
-* Generate benefits marker without UC
-gen econ_benefits_nonuc=econ_benefits
-replace econ_benefits_nonuc=0 if benefits_uc==1
-label var econ_benefits_nonuc "Household income includes non-UC benefits"
-
-* Generate benefits marker with UC
-gen econ_benefits_uc=econ_benefits
-replace econ_benefits_uc=0 if benefits_uc==0
-label var econ_benefits_uc "Household income includes UC benefits"
-
-
-/***************************** Financial Distress ***************************************************************************/
-// This is a measure of subjective financial distress, corresponding to answering 4 or 5 to the question below:
-// How well would you say you yourself are managing financially these days? Would you say you are...
-// 1. Living comfortably
-// 2. Doing alright
-// 3. Just about getting by
-// 4. Finding it quite difficult
-// 5. Finding it very difficult
-
-recode finnow (1 2 3 = 0) (4 5 = 1) (else = .), gen(financial_distress)
-lab var financial_distress "DEMOGRAPHIC: Financial Distress"
-
-// Impute financial distress when missing
-preserve
-drop if dgn < 0 | dag < 0 | dhe < 0 | drgn1 < 0
-eststo predict_financial_distress: logit financial_distress c.dag i.dgn i.drgn1 i.swv i.dhe c.dls i.unemp i.dhh_owned c.yhhnb_asinh, vce(robust)
-restore
-estimates restore predict_financial_distress
-predict financial_distress_prediction
-
-replace financial_distress = 1 if missing(financial_distress) & financial_distress_prediction >= 0.5
-replace financial_distress = 0 if missing(financial_distress) & financial_distress_prediction < 0.5
-
 
 /*****************Was in continuous education sample***************************/
 //Generated from age_dv and ded variables. 1 includes first instance of not being in education.
@@ -1450,7 +1469,7 @@ lab define sedcsmpl  1 "Aged 16-29 and were in continuous education"
 lab values sedcsmpl sedcsmpl
 
 
-/**********************Return to education sample*****************************/
+/**********************Return to education sample******************************/
 //Generated from age_dv and drtren 
 gen sedrsmpl =0 
 replace sedrsmpl = 1 if (dag>=16 & dag<=35 & ded==0) 
@@ -1458,7 +1477,7 @@ lab var sedrsmpl "SYSTEM : Return to education sample"
 lab define  sedrsmpl  1 "Aged 16-35 and not in continuous education"
 lab values sedrsmpl sedrsmpl
 
-/**********************In Continuous education sample*************************/
+/**********************In Continuous education sample**************************/
 //Generated from sedcsmpl and ded variables. Sample: Respondents who were in continious education and left it. 
 cap gen scedsmpl = 0 
 replace scedsmpl=1 if sedcsmpl==1 & ded == 0 /*were but currently not in continuous full-time education*/
@@ -1467,7 +1486,7 @@ lab define  scedsmpl  1 "Left continuous education"
 lab values scedsmpl scedsmpl
 
 
-/*****************************Weights*****************************************/
+/*****************************Weights******************************************/
 /*dimlwt	indinus_l DEMOGRAPHIC : Individual Longitudinal Weight - Main survey	
 Longitudinal individual main survey weight from indinus_lw (waves 2 onward) variable*/
 gen dimlwt = indinus_lw 
@@ -1503,27 +1522,25 @@ replace dwt = max_dwt if missing(dhhwt )
 replace dwt = 0 if missing(dwt)
 
 
-/***************************Keep required variables***************************/
+/***************************Keep required variables****************************/
 keep ivfio idhh idperson idpartner idfather idmother dct drgn1 dwt dnc02 dnc dgn dgnsp dag dagsq dhe dhesp dcpst  ///
-	ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dcpagdf dlltsd dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq dimlwt disclwt ///
+	ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dcpagdf dlltsd dlltsd01  dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq dimlwt disclwt ///
 	dimxwt dhhwt jbhrs jshrs j2hrs jbstat les_c3 les_c4 lessp_c3 lessp_c4 lesdf_c4 ydses_c5 month scghq2_dv ///
-	ypnbihs_dv yptciihs_dv yplgrs_dv ynbcpdf_dv ypncp ypnoab swv sedex ssscp sprfm sedag stm dagsp lhw l1_lhw pno ppno hgbioad1 hgbioad2 der adultchildflag ///
-        econ_benefits econ_benefits_nonuc econ_benefits_uc ///
-	sedcsmpl sedrsmpl scedsmpl dhh_owned dukfr dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp ypnoab_lvl *_flag  Int_Date dhe_mcs dhe_pcs dls dot unemp financial_distress
+	ypnbihs_dv yptciihs_dv yplgrs_dv ynbcpdf_dv ypncp ypnoab swv sedex ssscp sprfm sedag stm dagsp lhw pno ppno hgbioad1 hgbioad2 der adultchildflag ///
+	sedcsmpl sedrsmpl scedsmpl dhh_owned dukfr dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp dlltsd01_sp ypnoab_lvl *_flag  Int_Date dhe_mcs dhe_pcs dot dot01 unemp dhe_mcssp dhe_pcssp ddt
 
 sort swv idhh idperson 
 
 
 /**************************Recode missing values*******************************/
 foreach var in idhh idperson idpartner idfather idmother dct drgn1 dwt dnc02 dnc dgn dgnsp dag dagsq dhe dhesp dcpst ///
-	ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dlltsd dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq ///
+	ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dlltsd dlltsd01 dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq ///
 	jbhrs jshrs j2hrs jbstat les_c3 les_c4 lessp_c3 lessp_c4 lesdf_c4 ydses_c5 scghq2_dv ///
-	ypnbihs_dv yptciihs_dv yplgrs_dv swv sedex ssscp sprfm sedag stm dagsp lhw l1_lhw pno ppno hgbioad1 hgbioad2 der dhh_owned ///
-        econ_benefits econ_benefits_nonuc econ_benefits_uc ///
-	scghq2_dv_miss_flag dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp ypnoab_lvl *_flag dhe_mcs dhe_pcs dls dot unemp {
+	ypnbihs_dv yptciihs_dv yplgrs_dv swv sedex ssscp sprfm sedag stm dagsp lhw pno ppno hgbioad1 hgbioad2 der dhh_owned ///
+	scghq2_dv_miss_flag dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp dlltsd01_sp ypnoab_lvl *_flag dhe_mcs dhe_pcs dot dot01 unemp dhe_mcssp dhe_pcssp {
 		qui recode `var' (-9/-1=-9) (.=-9) 
 }
-
+replace ddt = -9 if ddt<0 //NOTE(review): missing ddt (.) is not <0 in Stata, so it stays . here, whereas the loop above recodes . to -9 - confirm this is intended
 *recode missings in weights to zero. 
 foreach var in dimlwt disclwt dimxwt dhhwt {
 	qui recode `var' (.=0) (-9/-1=0) 
@@ -1541,8 +1558,6 @@ replace l1_potential_earnings_hourly = 0 if missing(l1_potential_earnings_hourly
 		
 * initialise wealth to missing 
 gen liquid_wealth = -9
-gen tot_pen = -9
-gen nvmhome = -9
 gen smp = -9
 gen rnk = -9
 gen mtc = -9
@@ -1554,6 +1569,8 @@ drop if dup == 1 //0 duplicates
 drop dup
 isid idperson idhh swv	
 
+duplicates tag idperson ddt, gen(dup2)
+fre dup2
 
 /*******************************************************************************
 * save the whole pooled dataset that will be used for regression estimates
@@ -1564,7 +1581,7 @@ cap log close
 
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	father_edu.dta
diff --git a/input/InitialPopulations/compile/03_social_care_received.do b/input/InitialPopulations/compile/03_social_care_received.do
index 101f3a31e..24b3ee456 100644
--- a/input/InitialPopulations/compile/03_social_care_received.do
+++ b/input/InitialPopulations/compile/03_social_care_received.do
@@ -5,15 +5,15 @@
 *	AUTH: Justin van de Ven (JV)
 *	LAST EDIT: Daria Popova
 *
-*******************************************************************************/
+********************************************************************************/
 
-**************************************************************************************
+***************************************************************************************
 cap log close 
 log using "${dir_log}/03_social-care_received.log", replace
 ***************************************************************************************
 /********************************************************************************
 	local data directories - commented out when using master program
-*******************************************************************************/
+********************************************************************************/
 
 * define seed to ensure replicatability of results
 global seedBase = 3141592
@@ -38,14 +38,14 @@ matrix careHourlyWageRates = (9.04 \ ///	2010
 
 /**********************************************************************
 *	start analysis
-*********************************************************************/
+**********************************************************************/
 cd "${dir_data}"
 disp "identifying social care data"
 
 
 /**************************************************************************************
 *	load data
-*************************************************************************************/
+**************************************************************************************/
 global firstWave = 7
 foreach waveid in $scRecWaves {
 	
@@ -134,7 +134,7 @@ qui {
 
 /**************************************************************************************
 *	interpolate missing data
-*************************************************************************************/
+**************************************************************************************/
 
 // identify gaps in data
 disp "filling gaps in data"
@@ -228,7 +228,7 @@ qui {
 
 /**************************************************************************************
 *	merge with main data set
-*************************************************************************************/
+**************************************************************************************/
 disp "merge results with existing data"
 
 qui {
@@ -268,7 +268,7 @@ cap log close
 
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	sample_temp.dta
diff --git a/input/InitialPopulations/compile/04_social_care_provided.do b/input/InitialPopulations/compile/04_social_care_provided.do
index 152984d16..e9c6ead53 100644
--- a/input/InitialPopulations/compile/04_social_care_provided.do
+++ b/input/InitialPopulations/compile/04_social_care_provided.do
@@ -5,7 +5,7 @@
 *	AUTH: Justin van de Ven (JV)
 *	LAST EDIT: Daria Popova 
 *
-*******************************************************************************/
+********************************************************************************/
 
 ***************************************************************************************
 cap log close 
@@ -13,12 +13,12 @@ log using "${dir_log}/04_social_care_provided.log", replace
 ***************************************************************************************
 /********************************************************************************
 	local data directories - commented out when using master program
-*******************************************************************************/
+********************************************************************************/
 
 
 /**********************************************************************
 *	start analysis
-*********************************************************************/
+**********************************************************************/
 cd "${dir_data}"
 disp "identifying social care provision"
 
@@ -66,7 +66,7 @@ save "${dir_data}/ukhls_scprov_pooled0.dta", replace
 
 /**************************************************************************************
 *	process variables
-*************************************************************************************/
+**************************************************************************************/
 use "ukhls_scprov_pooled0.dta", clear
 
 // provision of care
@@ -129,7 +129,7 @@ save "ukhls_scprov_pooled1.dta", replace
 
 /**************************************************************************************
 *	merge with main data set
-*************************************************************************************/
+**************************************************************************************/
 disp "merge results with existing data"
 
 use "UKHLS_pooled_all_obs_03.dta", clear
@@ -146,7 +146,7 @@ save "ukhls_pooled_all_obs_04.dta", replace
 cap log close 
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	int_temp.dta
diff --git a/input/InitialPopulations/compile/05_create_benefit_units.do b/input/InitialPopulations/compile/05_create_benefit_units.do
index 7e4eac830..d05e5b256 100644
--- a/input/InitialPopulations/compile/05_create_benefit_units.do
+++ b/input/InitialPopulations/compile/05_create_benefit_units.do
@@ -17,12 +17,12 @@ log using "${dir_log}/05_drop_hholds_create_benefit_units.log", replace
 ********************************************************************************
 
 use "$dir_data\UKHLS_pooled_all_obs_04.dta", clear 
-/******************************************************************************/
+/*******************************************************************************/
 fre ivfio
 keep if ivfio == 1 | ivfio == 2 | ivfio == 21 | ivfio == 24 
 fre ivfio
 
-/******************************Split households*******************************/
+/******************************Split households********************************/
 *DP: This procedure is revised following the approach taken for the EU-SILC based models  
 /**********************Rules and assumptions***********************************
 1. Each HH can contain: Responsible Male, and/or Responsible Female, Children, Other members.
@@ -501,3 +501,17 @@ drop if stm<0
 save "$dir_data\ukhls_pooled_all_obs_05.dta", replace  
 
 cap log close 
+/**************************************************************************************
+* clean-up and exit
+**************************************************************************************/
+#delimit ;
+local files_to_drop 
+	fatherinfo.dta
+	motherinfo.dta
+	orphans.dta
+	;
+#delimit cr // cr stands for carriage return
+
+foreach file of local files_to_drop { 
+	cap erase "$dir_data/`file'" // cap: a temporary file that is already absent must not abort the run after results are saved
+}
diff --git a/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do b/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do
index e901b07ee..9d5af4122 100644
--- a/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do
+++ b/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do
@@ -13,16 +13,16 @@
 
 ********************************************************************************
 cap log close 
-log using "${dir_log}/05_drop_hholds.log", replace
+log using "${dir_log}/05_drop_hholds_create_benefit_units.log", replace
 ********************************************************************************
 
 use "$dir_data\UKHLS_pooled_all_obs_04.dta", clear 
-/******************************************************************************/
+/*******************************************************************************/
 fre ivfio
 keep if ivfio == 1 | ivfio == 2 | ivfio == 21 | ivfio == 24 
 fre ivfio
 //(88,338 observations deleted) 
-/******************************Split households*******************************/
+/******************************Split households********************************/
 
 *DP: script from "Data management replication file"
 /**********************Rules and assumptions***********************************
@@ -44,7 +44,7 @@ In the simulation everyone starts as "Other member" and is assigned one of the r
 */
 
 
-
+/*
 *Create unique partnership identifier within each household
 /*Cond(x,a,b)
 Description:  a if x is true and nonmissing, b if x is false; a if c is not specified and x evaluates to missing
@@ -131,12 +131,12 @@ gsort +swv +idhh -dag
 by swv idhh: carryforward idfather2, replace
 replace idfather = idfather2 if dag < $age_become_responsible & idmother<0 & idfather<0 & !missing(idfather2)
 
-/**************************Drop remaining orphans *********************************************/
+/**************************Drop remaining orphans **********************************************/
 count if dag < $age_become_responsible & idmother<0 & idfather<0
 /*143 cases in total*/
 bys swv: count if dag < $age_become_responsible & idmother<0 & idfather<0
 drop if dag < $age_become_responsible & idmother<0 & idfather<0
-/**********************************************************************************************/
+/***********************************************************************************************/
 
 
 *Check for same-sex couples
@@ -202,7 +202,7 @@ count if samesex_hh==1
 bys swv: fre samesex_hh
 /* 2,855 hhds in total, aprox 230 -250 in each wave */
 drop if samesex_hh == 1
-/************************************************************************************************************/
+/*************************************************************************************************************/
 
 
 * Clean up
@@ -228,9 +228,9 @@ replace idhome = idhhmother if adultChildFlag == 1 & !missing(idhhmother)
 replace idhome = idhhfather if adultChildFlag == 1 & missing(idhhmother) & !missing(idhhfather)
 */
 
-/**************************************************************************************************************************/
+/***************************************************************************************************************************/
 *DP: script from "UK Compile do-file" - a more recent version of a split 
-/**************************************************************************************************************************/
+/***************************************************************************************************************************/
 * recode same sex couples as singles
 replace idpartner = -9 if (ssscp==1)
 replace dcpst = 2 if (ssscp==1)
@@ -431,3 +431,5 @@ bys stm idhh: egen dropHH = max(dropObs)
 bys stm: tab dropHH, mis
 drop if stm<0
 save "$dir_data\ukhls_pooled_all_obs_05.dta", replace  
+
+cap log close 
diff --git a/input/InitialPopulations/compile/06_reweight_and_slice.do b/input/InitialPopulations/compile/06_reweight_and_slice.do
index 5e860572c..5a4b5295a 100644
--- a/input/InitialPopulations/compile/06_reweight_and_slice.do
+++ b/input/InitialPopulations/compile/06_reweight_and_slice.do
@@ -5,7 +5,7 @@
 *	AUTH: Patryk Bronka, Daria Popova, Justin van de Ven
 *	LAST EDIT: 15 Dec 2025 DP 
 *
-*********************************************************************/
+**********************************************************************/
 ********************************************************************************
 cap log close 
 log using "${dir_log}/06_reweight_and_slice.log", replace
@@ -104,7 +104,7 @@ recode dcpyy dcpagdf ynbcpdf_dv dnc02 dnc ypnbihs_dv yptciihs_dv ypncp ypnoab yp
 save "$dir_data\ukhls_pooled_all_obs_06.dta", replace  
 
 
-/**********************Slice the original pooled dataset into years *******************************************/
+/**********************Slice the original pooled dataset into years ********************************************/
 forvalues yy = $firstSimYear/$lastSimYear {
 
 	use "$dir_data\ukhls_pooled_all_obs_06.dta", clear
@@ -126,7 +126,7 @@ forvalues yy = $firstSimYear/$lastSimYear {
 cap log close
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	temp_adjusted_dwt.dta
diff --git a/input/InitialPopulations/compile/07_was_wealth_data.do b/input/InitialPopulations/compile/07_was_wealth_data.do
index fe1b06504..a02d1733a 100644
--- a/input/InitialPopulations/compile/07_was_wealth_data.do
+++ b/input/InitialPopulations/compile/07_was_wealth_data.do
@@ -568,7 +568,7 @@ sum wealth [fweight=dwt] if (bu_rp), detail
 
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	chk.dta
diff --git a/input/InitialPopulations/compile/08_wealth_to_ukhls.do b/input/InitialPopulations/compile/08_wealth_to_ukhls.do
index 1f2b39971..b3be256e0 100644
--- a/input/InitialPopulations/compile/08_wealth_to_ukhls.do
+++ b/input/InitialPopulations/compile/08_wealth_to_ukhls.do
@@ -7,19 +7,19 @@
 *	AUTH: Justin van de Ven (JV)
 *	LAST EDIT: 01/11/2023 (JV)
 *
-*********************************************************************/
+**********************************************************************/
 
 
 /**********************************************************************
 *	start analysis
-*********************************************************************/
+**********************************************************************/
 cd "${dir_data}"
 disp "imputing wealth data"
 *global yearWealth = 2019
 
 /**********************************************************************
 *	preliminaries
-*********************************************************************/
+**********************************************************************/
 * define seed to ensure replicatability of results
 global seedBase = 3141592
 global seedAdjust = 0
@@ -27,7 +27,7 @@ global seedAdjust = 0
 
 /**********************************************************************
 *	adjust UKHLS data to facilitate imputation
-*********************************************************************/
+**********************************************************************/
 use "population_initial_fs_UK_$yearWealth", clear
 sort idperson
 drop liquid_wealth smp rnk mtc
@@ -35,7 +35,7 @@ drop liquid_wealth smp rnk mtc
 
 /**********************************************************************
 *	align variable definitions
-*********************************************************************/
+**********************************************************************/
 gen dvage17 = 0
 forval ii = 1/16 {
 	
@@ -147,13 +147,13 @@ drop pct1
 
 /**********************************************************************
 *	save working data
-*********************************************************************/
+**********************************************************************/
 save "ukhls_wealthtemp.dta", replace
 
 
 /**********************************************************************
 *	analyse sample
-*********************************************************************/
+**********************************************************************/
 /*
 use "ukhls_wealthtemp.dta", clear
 tab gor2 [fweight=dwt2]
@@ -179,7 +179,7 @@ sum inc [fweight=dwt2] if (chk==0)
 *	matching organised around 3 sets of ranking criteria, where rank 1
 *	criteria are the most fine grained, and rank 3 are the most coarse
 *	grained.
-*********************************************************************/
+**********************************************************************/
 * identify non-reference population and save for retrieval
 use "ukhls_wealthtemp.dta", clear
 gen treat = (single_woman + single_man + couple_ref)
@@ -368,7 +368,7 @@ keep if (treat)
 
 /**********************************************************************
 *	append non-reference population
-*********************************************************************/
+**********************************************************************/
 append using "ukhls_wealthtemp2.dta"
 sort bu
 recode wealthi (mis=0)
@@ -391,7 +391,7 @@ sum liquid_wealth [fweight=dwt2], detail
 
 /**********************************************************************
 *	clean data and save
-*********************************************************************/
+**********************************************************************/
 use ukhls_wealthtemp3, clear
 drop dvage17 year gor gor2 sex nk na dhe2 dhesp2 grad gradsp emp empsp inci inc nk04i nk04 idnk04 dhe2grad dhe2ngrad dlltsdgrad dlltsdngrad empage single_woman single_man couple single ee ee2 was bu couple_ref pct dwt2 treat case person_id p_healths dlltsdsp healths wealth bu_rp tt dhe3 dhe4 dvage07 nk2 nk3 gor3 gor4 pct2 wealthi
 recode rnk smp mtc (missing = -9)
@@ -401,10 +401,10 @@ label var mtc "benefit unit id (bu) of matched observation"
 label var liquid_wealth "total wealth including housing, business and private (personal and occupational) pensions" 
 save "population_initial_fs_UK_$yearWealth", replace
 
-
+/*
 /**************************************************************************************
 * clean-up and exit
-*************************************************************************************/
+**************************************************************************************/
 #delimit ;
 local files_to_drop 
 	ukhls_wealthtemp.dta
diff --git a/input/InitialPopulations/compile/09_finalise_input_data.do b/input/InitialPopulations/compile/09_finalise_input_data.do
index 477110f0d..ff75b82da 100644
--- a/input/InitialPopulations/compile/09_finalise_input_data.do
+++ b/input/InitialPopulations/compile/09_finalise_input_data.do
@@ -6,7 +6,7 @@
 * COUNTRY:              UK
 * DATA:         	    UKHLS EUL version - UKDA-6614-stata [to wave n]
 * AUTHORS: 				Daria Popova, Justin van de Ven
-* LAST UPDATE:          15 Dec 2025
+* LAST UPDATE:          30 June 2025 DP 
 * NOTE:					Called from 00_master.do - see master file for further details
 ***************************************************************************************
 
@@ -18,7 +18,7 @@ log using "${dir_log}/09_finalise_input_data.log", replace
 * pool all waves
 ***************************************************************************************
 forvalues year = $firstSimYear/$lastSimYear {
-* load pooled data with missing values removed  
+* load pooled data   
 	
 	if (`year'==$firstSimYear) {
 		use "$dir_data/population_initial_fs_UK_`year'.dta", clear
@@ -180,34 +180,31 @@ forvalues yy = $firstSimYear/$lastSimYear {
 	sum one [w=dwt]
 
 	*limit saved variables
-	keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dhe ydses_c5 ///
+	keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 ///
 	yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 ///
-	stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt ///
-	potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
-        econ_benefits econ_benefits_nonuc econ_benefits_uc ///
-	ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dls dot unemp financial_distress
+	stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt ///
+	potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
+	ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dot dot01 unemp dhe_mcssp dhe_pcssp 
 	
-	order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen ///
-	dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag ///
-	multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
-        econ_benefits econ_benefits_nonuc econ_benefits_uc ///
-	ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dls dot unemp financial_distress
+	order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen ///
+	dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag ///
+	multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
+	ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp 
 	
-	recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp ///
-	dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 ///
-	adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs ///
-        econ_benefits econ_benefits_nonuc econ_benefits_uc ///
-	formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dls dot unemp financial_distress (missing=-9)
+	recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp ///
+	dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 ///
+	adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs ///
+	formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp  (missing=-9)
 	
 	gsort idhh idbenefitunit idperson
 	save "$dir_data/population_initial_UK_$year.dta", replace
 	
-	recode dgn liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost aidhrs carewho (-9=0)
+	recode dgn liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost aidhrs carewho (-9=0)
 	export delimited using "$dir_data/population_initial_UK_$year.csv", nolabel replace
 }
 
 cap log close
-***************************************************************************************
+/***************************************************************************************
 * finalise
 ***************************************************************************************
 #delimit ;
@@ -215,12 +212,12 @@ local files_to_drop
 	was_wealthdata.dta
 	;
 #delimit cr // cr stands for carriage return
-/*
+
 foreach file of local files_to_drop { 
 	erase "$dir_data/`file'"
 }
-*/
 
+*/
 ***************************************************************************************
 * end
 ***************************************************************************************
diff --git a/input/InitialPopulations/compile/10_check_yearly_data.do b/input/InitialPopulations/compile/10_check_yearly_data.do
index 4bd3570c7..f1ac358a5 100644
--- a/input/InitialPopulations/compile/10_check_yearly_data.do
+++ b/input/InitialPopulations/compile/10_check_yearly_data.do
@@ -6,24 +6,25 @@
 * COUNTRY:              UK
 * DATA:         	    UKHLS EUL version - UKDA-6614-stata [to wave n]
 * AUTHORS: 				Daria Popova
-* LAST UPDATE:          15 Dec 2025 DP 
+* LAST UPDATE:          30 June 2025 DP 
 * NOTE:					Called from 00_master.do - see master file for further details
 ***************************************************************************************/*
-set matsize 15000
+set matsize 11000, permanently
 ********************************************************************************/
 cap log close 
 log using "${dir_log}/10_check_yearly_data.log", replace
 ********************************************************************************
+ 
 
 *all variables 
 #delimit ;
 local varlist 
-idhh                    
-idbenefitunit                 
-idperson                    
-idpartner                    
-idmother                      
-idfather                        
+idhh
+idbenefitunit
+idperson
+idpartner
+idmother
+idfather
 pno                          
 swv                            
 dgn                           
@@ -35,7 +36,8 @@ ded
 deh_c3                        
 sedex                         
 les_c3                        
-dlltsd                         
+dlltsd    
+dlltsd01                    
 dhe                            
 ydses_c5                       
 yplgrs_dv                       
@@ -89,6 +91,7 @@ ypnoab
 dhe_mcs 
 dhe_pcs 
 dot 
+dot01
 unemp  
 ;
 #delimit cr // cr stands for carriage return
@@ -111,22 +114,22 @@ lesdf_c4
 les_c4                     
 lessp_c4          
 drgn1   
-dot     
+dot  
+dot01     
 ;
 #delimit cr // cr stands for carriage return 
 
 
 *new varlist with categorical variables outputted by category 
 #delimit ;
-local varlist2 
-idhh                    
-idbenefitunit                 
-idperson                    
-idpartner                    
-idmother                      
-idfather                        
-pno   
-ppno                        
+local varlist2  
+idhh
+idbenefitunit
+idperson
+idpartner
+idmother
+idfather    
+pno                       
 swv                            
 dgn                           
 dag                            
@@ -135,7 +138,8 @@ dnc02
 dnc                           
 ded
 sedex              
-dlltsd                         
+dlltsd  
+dlltsd01                       
 ypncp                           
 ypnoab         
 yplgrs_dv                       
@@ -233,12 +237,20 @@ daughter_socare_hrs
 son_socare_hrs                   
 other_socare_hrs                 
 formal_socare_cost
-liquid_wealth 
+liquid_wealth
+dhemcs 
+dhepcs 
 dot_1
 dot_2
 dot_3
 dot_4
-dot_5
+dot01_1
+dot01_2
+dot01_3
+dot01_4 
+dot01_5 
+dot01_6 
+unemp
 	;
 #delimit cr // cr stands for carriage return 
 
@@ -290,14 +302,19 @@ save "$dir_data/population_initial_UK_`year'_orig.dta", replace
 outreg2 using "$dir_data/population_initial_UK_orig_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2')
 
 }
-
 */
+
 *******************************************************
 *output summary stats for new initial populations     *
 *******************************************************
-forvalues year=2010/2023 {
+forvalues year=2010/2023 { 
 use "$dir_data/population_initial_UK_`year'.dta", clear  
 
+cap drop dhemcs    // dropped one at a time: a combined -drop- removes nothing if either name is absent
+cap drop dhepcs
+clonevar dhemcs=dhe_mcs    // dhemcs/dhepcs are the names listed in varlist2
+clonevar dhepcs=dhe_pcs 
+
 foreach var of local varlist_cat {
 recode `var' (0=.) (-9=.) 
 cap drop `var'_*
@@ -309,16 +326,14 @@ foreach var of local varlist2 {
 recode `var' (-9=.) 
  }
 
-foreach var in  need_socare  formal_socare_hrs  partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs  formal_socare_cost ///
-liquid_wealth {
+foreach var in  need_socare  formal_socare_hrs  partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs  formal_socare_cost liquid_wealth {
 recode `var' (.=0)
 }
  
- 
 order `varlist2' 
 qui sum `varlist2' , de 
 
-save "$dir_data/population_initial_UK_`year'.dta", replace   
+//save "$dir_data/population_initial_UK_`year'.dta", replace   
 outreg2 using "$dir_data/population_initial_UK_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2')
 }
 
@@ -326,9 +341,10 @@ outreg2 using "$dir_data/population_initial_UK_sumstats.xls" if stm==`year', sum
 **********************************************************************
 *output summary stats for new initial populations before dropping hhs*
 **********************************************************************
-forvalues year=2010/2023 {
+forvalues year=2010/2023 { 
 use "$dir_data/population_initial_fs_UK_`year'.dta", clear  
 
+
 cap gen dwt_sampling =0
 cap gen uk_pop=0                        
 cap gen surv_pop=0                        
@@ -336,6 +352,10 @@ cap gen multiplier=0
 cap gen adult = dag>=$age_become_responsible 
 cap gen child = 1 - adult    
 
+cap drop dhemcs    // dropped one at a time: a combined -drop- removes nothing if either name is absent
+cap drop dhepcs
+clonevar dhemcs=dhe_mcs  
+clonevar dhepcs=dhe_pcs 
 foreach var of local varlist_cat {
 recode `var' (0=.) (-9=.) 
 cap drop `var'_*
@@ -352,40 +372,49 @@ liquid_wealth {
 recode `var' (.=0)
 }
  
- 
+keep `varlist2' 
 order `varlist2' 
 qui sum `varlist2' , de 
 
-save "$dir_data/population_initial_fs_UK_`year'.dta", replace   
+//save "$dir_data/population_initial_fs_UK_`year'.dta", replace   
 outreg2 using "$dir_data/population_initial_fs_UK_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2')
 }
 
 
+
 cap erase "$dir_data/population_initial_UK_orig_sumstats.txt"
 cap erase "$dir_data/population_initial_UK_sumstats.txt"
 cap erase "$dir_data/population_initial_fs_UK_sumstats.txt"
 
 cap log close            
-   
-  
-
+ 
+/*  
 *************************************************************
 *clean up new initial populations - keep only required vars * 
 *************************************************************
-/*
 forvalues year=2010/2023 {
 insheet using "$dir_data/population_initial_UK_`year'.csv", clear  
 
-keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex les_c3 dlltsd dhe ///
-ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp ///
-lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned scghq2_dv_miss_flag lhw drgn1 dct dwt_sampling les_c4 dhm_ghq ///
-lessp_c4 adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly ///
-liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
- aidhrs carewho ypncp ypnoab dhe_mcs dhe_pcs dot unemp  
-
-
-save "$dir_data/population_initial_UK_`year'.dta", replace
-outsheet using "$dir_data/population_initial_UK_`year'.csv", nolabel replace
+	*limit saved variables
+	keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 ///
+	yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 ///
+	stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt ///
+	potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
+	ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dot dot01 unemp dhe_mcssp dhe_pcssp 
+	
+	order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen ///
+	dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag ///
+	multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost ///
+	ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp 
+	
+	recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp ///
+	dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 ///
+	adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs ///
+	formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp  (missing=-9)
+	
+	gsort idhh idbenefitunit idperson
+	save "$dir_data/population_initial_UK_`year'.dta", replace
+	export delimited using "$dir_data/population_initial_UK_`year'.csv", nolabel replace
 }
 */
 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_education.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_education.do
new file mode 100644
index 000000000..350e42815
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_education.do
@@ -0,0 +1,832 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Education
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova  
+* LAST UPDATE:		May 2025
+* COUNTRY: 			UK   
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					3 education processes estimated. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw, as in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_education.do"
+********************************************************************************
+
+*******************************************************
+* E1a: Probability of Leaving Initial Education Spell *
+*******************************************************
+
+* Year 
+use "$dir_validation_data/E1a_sample", clear
+
+// construct multiple versions of the predicted outcome allowing for different 
+// random draws 
+forvalues i = 0/19 {
+	local my_seed = 12345 + `i'  	// seeds 12345..12364, one per draw
+    set seed `my_seed' 	// reseed so every draw is reproducible
+	gen rnd = runiform() 	// one uniform random number per observation
+	gen pred_ded`i' = 0 
+	replace pred_ded`i' = 1 if inrange(p,rnd,1)	// 1 when rnd <= p <= 1; p presumably the predicted probability saved by reg_education.do - confirm
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+// for each iteration calculate the share that leave edu 
+collapse (mean) ded pred_ded* [aw = dwt], by(stm)
+
+order pred_ded*
+
+// take the average across datasets 
+egen pred_ded = rowmean(pred_ded0-pred_ded19)
+replace stm = 2000 + stm 
+
+twoway /// predicted vs observed yearly means of ded (observed drawn dashed)
+(line pred_ded stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line ded stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Continues in Initial Education Spell") xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16-29." "Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E1a_continues_edu_ts_16_29_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+restore  
+
+
+* Gender 
+preserve
+collapse (mean) ded pred_ded* [aw = dwt], by(dgn stm)
+
+order pred_ded*
+
+egen pred_ded = rowmean(pred_ded0-pred_ded19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_ded stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+
+twoway ///
+(line pred_ded stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+grc1leg graph1 graph2 ,  ///
+	title("Continues in Initial Education Spell") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16-29. Initial education" "spell defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/education/int_validation_E1a_continues_edu_ts_16_29_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all  
+
+restore
+ 
+ 
+* Age
+preserve
+
+collapse (mean) ded pred_ded* [aw = dwt], by(dag)
+
+order pred_ded*
+
+egen pred_ded = rowmean(pred_ded0-pred_ded19)
+
+twoway ///
+(line pred_ded dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line ded dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Continues in Initial Education Spell") subtitle("Share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16-29." "Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E1a_continues_edu_share_age.png", ///
+     as(png) replace width(2560) height(1440) //quality(100)
+	
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) ded pred_ded* [aw = dwt], by(ydses_c5 stm)
+
+order pred_ded*
+
+egen pred_ded = rowmean(pred_ded0-pred_ded19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_ded stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_ded stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_ded stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_ded stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_ded stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  /// combine the five quintile panels under one shared legend
+	title("Continues in Initial Education Spell") ///
+	subtitle("By hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16-29. Initial education" "spell defined generously.", size(vsmall))
+
+graph export ///
+"$dir_validation_graphs/education/int_validation_E1a_continues_edu_ts_16_29_both_income.png", ///	
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Marital status 
+preserve
+
+collapse (mean) ded pred_ded* [aw = dwt], by(dcpst stm)
+
+order pred_ded*
+
+egen pred_ded = rowmean(pred_ded0-pred_ded19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_ded stm if dcpst == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if dcpst == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_ded stm if dcpst == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if dcpst == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Single") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_ded stm if dcpst == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line ded stm if dcpst == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Previously partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Continues in Initial Education Spell") ///
+	subtitle("By partnership status") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16-29. Initial education" "spell defined generously.", size(vsmall))	
+	
+graph export ///
+"$dir_validation_graphs/education/int_validation_E1a_continues_edu_ts_16_29_both_partnership.png", ///	
+	as(png) replace width(2560) height(1440) //quality(100)
+
+		
+graph drop _all 	
+	
+restore
+
+
+**********************************************
+* E1b: Probability of Returning to Education *
+**********************************************
+
+* Year
+use "$dir_validation_data/E1b_sample", clear
+
+forvalues i = 0/19 {
+	local my_seed = 12345 + `i'  	// seeds 12345..12364, one per draw
+    set seed `my_seed' 	// reseed so every draw is reproducible
+	gen rnd = runiform() 	// one uniform random number per observation
+	gen pred_der`i' = 0 
+	replace pred_der`i' = 1 if inrange(p,rnd,1)	// 1 when rnd <= p <= 1; p presumably the predicted probability saved by reg_education.do - confirm
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) der pred_der* [aw = dwt], by(stm)
+
+order pred_der*
+
+egen pred_der = rowmean(pred_der0-pred_der19)
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_der stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line der stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Returns to Education") xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are not observed in their initial education spell in their previous" "observation, aged 16-35. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E1b_returns_edu_ts_16_35_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+   
+restore  
+
+
+* Gender 
+preserve
+
+collapse (mean) der pred_der* [aw = dwt], by(dgn stm)
+
+order pred_der*
+
+egen pred_der = rowmean(pred_der0-pred_der19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_der stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2, ///
+	title("Returns to education") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are not observed in their initial education spell in their previous" "observation, aged 16-35. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E1b_returns_edu_ts_16_35_gender.png", ///	
+as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all  
+
+restore
+ 
+ 
+* Age
+preserve
+
+collapse (mean) der pred_der* [aw = dwt], by(dag)
+
+order pred_der*
+
+egen pred_der = rowmean(pred_der0-pred_der19)
+
+twoway ///
+(line pred_der dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line der dag, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+title("Returns to Education") subtitle("Share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are not observed in their initial education spell in their previous" "observation, aged 16-35. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E1b_returns_edu_share_age.png", ///	
+	as(png) replace width(2560) height(1440) //quality(100)	
+
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) der pred_der* [aw = dwt], by(ydses_c5 stm)
+
+order pred_der*
+
+egen pred_der = rowmean(pred_der0-pred_der19)
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_der stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5, ///
+	title("Returns to Education") ///
+	subtitle("By hh disposable income") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are not observed in their initial education spell in their previous observation," "aged 16-35. Initial education spell defined generously.", size(vsmall))
+
+graph export ///
+"$dir_validation_graphs/education/int_validation_E1b_returns_edu_ts_16_35_both_income.png", ///	
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all 	
+	
+restore
+
+
+* Marital status 
+preserve
+
+collapse (mean) der pred_der* [aw = dwt], by(dcpst stm)
+
+order pred_der*
+
+egen pred_der = rowmean(pred_der0-pred_der19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_der stm if dcpst == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if dcpst == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Partnered") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if dcpst == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if dcpst == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Single") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_der stm if dcpst == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line der stm if dcpst == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Previously partnered") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3, /// combine the three partnership panels - shared legend taken from graph1
+	title("Returns to Education") ///
+	subtitle("By partnership status") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are not observed in their initial education spell in their previous observation," "aged 16-35. Initial education spell defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/education/int_validation_E1b_returns_edu_ts_16_35_both_partnership.png", ///	
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+*************************************************
+* E2a Educational Level After Leaving Education *
+*************************************************
+
+* Overall: histogram of one simulated attainment draw against the observed level
+use "$dir_validation_data/E2a_sample", clear
+
+sum p1-p3 // inspect negative values
+
+gen p1p2 = p1 + p2 // cumulative probability of outcomes 1 and 2
+
+set seed 12345 // pin the RNG state so the single draw below is reproducible
+gen rnd = runiform()
+gen edu_pred = cond((rnd < p1), 1, cond(rnd < p1p2, 2, 3)) // map the draw onto the cumulative probabilities
+
+keep if in_sample == 1
+
+twoway (histogram edu_pred if in_sample == 1, color(green)) ///
+	(histogram deh_c3_recoded if in_sample == 1, color(none) lcolor(black)), ///
+	xtitle("Education level") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Educational Attainment when Leave Initial Education Spell") ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed leaving their initial education spell in the" "current observation, aged 16-29. Initial education spell defined generously. 1 = Low education, 2 = Medium education," "3 = High education.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E2a_edu_attainment_hist_16_29_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+		
+
+* Year
+use "$dir_validation_data/E2a_sample", clear
+
+summarize p1-p3
+
+generate p1p2 = p1 + p2
+
+// 20 replications of the simulated attainment level, seeds 12345 through 12364
+forvalues draw = 0(1)19 {
+	set seed `=12345 + `draw''
+	generate rnd = runiform()
+	generate edu_pred`draw' = cond(rnd < p1, 1, cond(rnd < p1p2, 2, 3))
+	generate pred_edu_low`draw' = edu_pred`draw' == 1
+	generate pred_edu_med`draw' = edu_pred`draw' == 2
+	generate pred_edu_high`draw' = edu_pred`draw' == 3
+	drop rnd
+}
+
+keep if in_sample == 1
+
+generate edu_low = deh_c3_recoded == 1
+generate edu_med = deh_c3_recoded == 2
+generate edu_high = deh_c3_recoded == 3
+
+preserve 
+
+collapse (mean) edu_low edu_med edu_high pred_edu_low* pred_edu_med* ///
+	pred_edu_high* [aw = dwt], by(stm)
+
+order pred_edu_low* pred_edu_med* pred_edu_high*	
+	
+egen pred_edu_low = rowmean(pred_edu_low0-pred_edu_low19)
+egen pred_edu_med = rowmean(pred_edu_med0-pred_edu_med19)
+egen pred_edu_high = rowmean(pred_edu_high0-pred_edu_high19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_edu_low stm, sort color(red) legend(label(1 "Low education, predicted"))) ///
+(line edu_low stm, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "Low education, observed"))) ///
+(line pred_edu_med stm, sort color(blue) legend(label(3 "Medium education, predicted"))) ///
+(line edu_med stm, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "Medium education, observed"))) ///
+(line pred_edu_high stm, sort color(green) legend(label(5 "High education, predicted"))) ///
+(line edu_high stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "High education, observed"))) , ///
+ title("Educational Attainment when Leave Initial Education Spell") ///
+	subtitle("Ages 16-29" ) ///
+	xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed leaving their initial education spell in the current" "observation, aged 16-29. Initial education spell defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/education/int_validation_E2a_edu_attainment_ts_16_29_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+
+graph drop _all 	
+	
+restore
+
+
+* Gender 
+preserve 
+
+collapse (mean) edu_low edu_med edu_high pred_edu_low* pred_edu_med* ///
+	pred_edu_high* [aw = dwt], by(stm dgn)
+
+order pred_edu_low* pred_edu_med* pred_edu_high*		
+	
+egen pred_edu_low = rowmean(pred_edu_low0-pred_edu_low19)
+egen pred_edu_med = rowmean(pred_edu_med0-pred_edu_med19)
+egen pred_edu_high = rowmean(pred_edu_high0-pred_edu_high19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_edu_low stm if dgn == 0, sort color(red) legend(label(1 "Low education, predicted"))) ///
+(line edu_low stm if dgn == 0, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "Low education, observed"))) ///
+(line pred_edu_med stm if dgn == 0, sort color(blue) legend(label(3 "Medium education, predicted"))) ///
+(line edu_med stm if dgn == 0, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "Medium education, observed"))) ///
+(line pred_edu_high stm if dgn == 0, sort color(green) legend(label(5 "High education, predicted"))) ///
+(line edu_high stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "High education, observed"))) , ///
+	name(edu_attainment_female, replace) ///
+ title("Females") ///
+	xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+	
+twoway ///
+(line pred_edu_low stm if dgn == 1, sort color(red) legend(label(1 "Low education, predicted"))) ///
+(line edu_low stm if dgn == 1, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "Low education, observed"))) ///
+(line pred_edu_med stm if dgn == 1, sort color(blue) legend(label(3 "Medium education, predicted"))) ///
+(line edu_med stm if dgn == 1, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "Medium education, observed"))) ///
+(line pred_edu_high stm if dgn == 1, sort color(green) legend(label(5 "High education, predicted"))) ///
+(line edu_high stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "High education, observed"))) , ///
+	name(edu_attainment_male, replace) ///
+ title("Males") ///
+	xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 	
+	
+grc1leg edu_attainment_female edu_attainment_male, ///
+	title("Educational Attainment when Leave Initial Education Spell") ///
+	legendfrom(edu_attainment_male) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed leaving their initial education spell in the current" "observation, aged 16-29. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E2a_edu_attainment_ts_16_29_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all 	
+	
+restore
+	
+	
+* Age
+preserve 
+
+collapse (mean) edu_low edu_med edu_high pred_edu_low* pred_edu_med* ///
+	pred_edu_high* [aw = dwt], by(dag)
+
+order pred_edu_low* pred_edu_med* pred_edu_high*		
+	
+egen pred_edu_low = rowmean(pred_edu_low0-pred_edu_low19)
+egen pred_edu_med = rowmean(pred_edu_med0-pred_edu_med19)
+egen pred_edu_high = rowmean(pred_edu_high0-pred_edu_high19)
+
+
+twoway ///
+(line pred_edu_low dag, sort color(red) ///
+	legend(label(1 "Low education, predicted"))) ///
+(line edu_low dag, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "Low education, observed"))) ///
+(line pred_edu_med dag, sort color(blue) ///
+	legend(label(3 "Medium education, predicted"))) ///
+(line edu_med dag, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "Medium education, observed"))) ///
+(line pred_edu_high dag, sort color(green) ///
+	legend(label(5 "High education, predicted"))) ///
+(line edu_high dag, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "High education, observed"))), ///
+	title("Educational Attainment when Leave Initial Education Spell") ///
+	subtitle("By age") ///
+	xtitle("Age") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed leaving their initial education spell in the current" "observation, aged 16-29. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/education/int_validation_E2a_edu_attainment_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Income  
+preserve 
+
+collapse (mean) edu_low edu_med edu_high pred_edu_low* pred_edu_med* ///
+	pred_edu_high* [aw = dwt], by(stm ydses_c5)
+
+order pred_edu_low* pred_edu_med* pred_edu_high*	
+	
+egen pred_edu_low = rowmean(pred_edu_low0-pred_edu_low19)
+egen pred_edu_med = rowmean(pred_edu_med0-pred_edu_med19)
+egen pred_edu_high = rowmean(pred_edu_high0-pred_edu_high19)
+
+replace stm = 2000 + stm 
+
+twoway /// first-quintile panel - its legend is the one reused by grc1leg via legendfrom(graph1)
+(line pred_edu_low stm if ydses_c5 == 1, sort color(red) ///
+	legend(label(1 "Low education, predicted"))) ///
+(line edu_low stm if ydses_c5 == 1, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "Low education, observed"))) ///
+(line pred_edu_med stm if ydses_c5 == 1, sort color(blue) ///
+	legend(label(3 "Medium education, predicted"))) ///
+(line edu_med stm if ydses_c5 == 1, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "Medium education, observed"))) ///
+(line pred_edu_high stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(5 "High education, predicted"))) ///
+(line edu_high stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "High education, observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_edu_low stm if ydses_c5 == 2, sort color(red) ///
+	legend(label(1 "L Pred"))) ///
+(line edu_low stm if ydses_c5 == 2, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "L Obs"))) ///
+(line pred_edu_med stm if ydses_c5 == 2, sort color(blue) ///
+	legend(label(3 "M Pred"))) ///
+(line edu_med stm if ydses_c5 == 2, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "M Obs")))	///
+(line pred_edu_high stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(5 "H Pred"))) ///
+(line edu_high stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "H Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+twoway ///
+(line pred_edu_low stm if ydses_c5 == 3, sort color(red) ///
+	legend(label(1 "L Pred"))) ///
+(line edu_low stm if ydses_c5 == 3, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "L Obs"))) ///
+(line pred_edu_med stm if ydses_c5 == 3, sort color(blue) ///
+	legend(label(3 "M Pred"))) ///
+(line edu_med stm if ydses_c5 == 3, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "M Obs")))	///
+(line pred_edu_high stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(5 "H Pred"))) ///
+(line edu_high stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "H Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+twoway ///
+(line pred_edu_low stm if ydses_c5 == 4, sort color(red) ///
+	legend(label(1 "L Pred"))) ///
+(line edu_low stm if ydses_c5 == 4, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "L Obs"))) ///
+(line pred_edu_med stm if ydses_c5 == 4, sort color(blue) ///
+	legend(label(3 "M Pred"))) ///
+(line edu_med stm if ydses_c5 == 4, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "M Obs")))	///
+(line pred_edu_high stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(5 "H Pred"))) ///
+(line edu_high stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "H Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+	
+twoway ///
+(line pred_edu_low stm if ydses_c5 == 5, sort color(red) ///
+	legend(label(1 "L Pred"))) ///
+(line edu_low stm if ydses_c5 == 5, sort color(red) color(red%20) ///
+	lpattern(dash) legend(label(2 "L Obs"))) ///
+(line pred_edu_med stm if ydses_c5 == 5, sort color(blue) ///
+	legend(label(3 "M Pred"))) ///
+(line edu_med stm if ydses_c5 == 5, sort color(blue) color(blue%20) ///
+	lpattern(dash) legend(label(4 "M Obs")))	///
+(line pred_edu_high stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(5 "H Pred"))) ///
+(line edu_high stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(6 "H Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+grc1leg graph1 graph2 graph3 graph4 graph5 , ///
+	title("Educational Attainment when Leave Initial Education Spell") ///
+	subtitle("By hh disposable income") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed leaving their initial education spell in the current" "observation, aged 16-29. Initial education spell defined generously.", size(vsmall))	
+	
+graph export "$dir_validation_graphs/education/int_validation_E2a_edu_attainment_ts_16_29_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all 	
+	
+restore 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_fertility.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_fertility.do
new file mode 100644
index 000000000..d02efd72b
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_fertility.do
@@ -0,0 +1,481 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Fertility
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova  
+* LAST UPDATE:		July 2025
+* COUNTRY: 			UK
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					2 fertility processes estimated. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_fertility.do"
+********************************************************************************
+
+**********************************************
+* F1a - Having a child, in initial edu spell * 
+**********************************************
+
+* Overall
+use "$dir_validation_data/F1a_sample", clear
+
+set seed 12345 // fixed seed so the single draw below is reproducible
+gen rnd = runiform()
+// simulated birth indicator: 1 when rnd <= p <= 1, otherwise 0
+gen pred_dchpd = inrange(p, rnd, 1)
+
+keep if in_sample == 1
+
+twoway ///
+	(histogram pred_dchpd, color(red)) ///
+	(histogram dchpd, color(none) lcolor(black)), ///
+	xtitle("Had child") ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Fertility in initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating a female has a new born child. Estimation sample plotted. Sample contains females" "who are in their initial education spell and fertile (18-30). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1a_fertility_init_edu_hist_18_30.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+	
+* Year 
+use "$dir_validation_data/F1a_sample", clear
+
+// Build 20 replications of the simulated outcome, each from its own
+// uniform draw. Seeds run from 12345 through 12364, one per replication,
+// so every replication can be reproduced.
+forvalues draw = 0(1)19 {
+	set seed `=12345 + `draw''
+	generate rnd = runiform()
+	// 1 when rnd <= p <= 1, otherwise 0
+	generate pred_dchpd`draw' = inrange(p, rnd, 1)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+// for each iteration calculate the share that have a new born child
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(stm)
+
+order pred_dchpd*
+
+// take the average across datasets 
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+// replace stm= 2000 + stm 
+
+twoway ///
+(line pred_dchpd stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dchpd stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Fertility in initial education spell") xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who are in their" "initial education spell and fertile (18-30). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1a_fertility_init_edu_ts_18_30.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+restore  
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(dag)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+
+twoway ///
+(line pred_dchpd dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dchpd dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Fertility in initial education spell") ///
+	subtitle("Share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who are in their" "initial education spell and fertile (18-30). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1a_fertility_init_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+// replace stm= 2000 + stm 
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dchpd stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+ 
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway /// fourth-quintile panel (ydses_c5 == 4)
+(line pred_dchpd stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+ 
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Fertility in initial education spell") ///
+	subtitle("By hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who are in their" "initial education spell and fertile (18-30). Initial education spell defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1a_fertility_init_edu_ts_18_30_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all 	
+	
+restore
+
+
+************************************************
+* F1b - Having a child, left initial edu spell *
+************************************************
+
+* Overall
+use "$dir_validation_data/F1b_sample", clear
+
+set seed 12345 // fixed seed so the single draw below is reproducible
+gen rnd = runiform()
+// simulated birth indicator: 1 when rnd <= p <= 1, otherwise 0
+gen pred_dchpd = inrange(p, rnd, 1)
+
+keep if in_sample == 1
+
+twoway ///
+	(histogram pred_dchpd, color(red)) ///
+	(histogram dchpd, color(none) lcolor(black)), ///
+	xtitle("Had child") ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Fertility") ///
+	subtitle("Left initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating a female has a new born child. Estimation sample plotted. Sample" "contains females who have left their initial education spell and are in their fertile years (18-45). Initial education spell defined" "generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1b_fertility_left_edu_hist_18_45.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+	
+* Year 
+use "$dir_validation_data/F1b_sample", clear
+
+// Build 20 replications of the simulated outcome, each from its own
+// uniform draw. Seeds run from 12345 through 12364, one per replication,
+// so every replication can be reproduced.
+forvalues draw = 0(1)19 {
+	set seed `=12345 + `draw''
+	generate rnd = runiform()
+	// 1 when rnd <= p <= 1, otherwise 0
+	generate pred_dchpd`draw' = inrange(p, rnd, 1)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(stm)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+
+// replace stm= 2000 + stm 
+
+twoway ///
+(line pred_dchpd stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dchpd stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Fertility") ///
+	subtitle("Left initial education spell") ///
+	xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who have left their" "initial education spell and are in their fertile years (18-45). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1b_fertility_left_edu_ts_18_45.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+restore  
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(dag)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+
+twoway ///
+(line pred_dchpd dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dchpd dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Fertility ") ///
+	subtitle("Left initial education spell, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who have left their initial" "education spell and are in their fertile years (18-45). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1b_fertility_left_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+
+// replace stm= 2000 + stm 
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dchpd stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  /// combine the five quintile panels - shared legend taken from graph1
+	title("Fertility") ///
+	subtitle("Left initial education spell, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who have left their initial education" "spell and are in their fertile years (18-45). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1b_fertility_left_edu_ts_18_45_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Education
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(deh_c3 stm)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+
+// replace stm= 2000 + stm 
+
+twoway ///
+(line pred_dchpd stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dchpd stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("High education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3,  ///
+	title("Fertility") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who have left their initial education" "spell and are in their fertile years (18-45). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1b_fertility_left_edu_ts_18_45_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Marital status 
+preserve
+
+collapse (mean) dchpd pred_dchpd* [aw = dwt], by(dcpst stm)
+
+order pred_dchpd*
+
+egen pred_dchpd = rowmean(pred_dchpd0-pred_dchpd19)
+
+// replace stm= 2000 + stm 
+
+twoway ///
+(line pred_dchpd stm if dcpst == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dchpd stm if dcpst == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if dcpst == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if dcpst == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Single") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dchpd stm if dcpst == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dchpd stm if dcpst == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Previously partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3,  /// combine the three partnership panels - shared legend taken from graph1
+	title("Fertility") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Share of females that have a new born child. Estimation sample plotted. Sample contains females who have left their initial education" "spell and are in their fertile years (18-45). Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/fertility/int_validation_${country}_F1b_fertility_left_edu_ts_18_45_partnership.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+	
+graph drop _all 	
+	
+restore
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_health.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_health.do
new file mode 100644
index 000000000..b43faca06
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_health.do
@@ -0,0 +1,883 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Health 
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova 
+* LAST UPDATE:		July 2025
+* COUNTRY: 			UK   
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					3 health processes estimated. 
+* 					Individual heterogeneity is added to the standard predicted 
+* 					values using a random draw, as in stochastic 
+* 					imputation. The pooled mean is obtained, as in multiple 
+* 					imputation, by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_health.do"
+********************************************************************************
+
+********************************************
+* H1a: Health status, in initial edu spell *
+********************************************
+
+* Overall 
+use "$dir_validation_data/H1a_sample", clear
+
+sum p1-p5 // inspect negative values 
+		
+gen p1p2 = p1 + p2 // running sums of the category probabilities p1-p4
+gen p1p2p3 = p1p2 + p3
+gen p1p2p3p4 = p1p2p3 + p4 // generate cumulative probabilities for all options
+set seed 12345 // fix the RNG state so this single draw (and the exported histogram) is reproducible
+gen rnd = runiform()
+gen pred_health = cond((rnd < p1), 1, cond(rnd < p1p2, 2, ///
+	cond(rnd < p1p2p3, 3, cond(rnd < p1p2p3p4, 4, 5)))) // first category whose cumulative bound exceeds rnd
+
+keep if in_sample == 1	
+	
+twoway (histogram pred_health if in_sample == 1, color(red)) ///
+	(histogram dhe if in_sample == 1, color(none) lcolor(black) ), ///
+	xtitle (Self-rated health) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Health Status") ///
+	subtitle("In initial education spell ") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of self-reported health status. Estimation sample plotted. Sample contains individuals" "who are in their initial education spell and aged 18-29. Initial education spell defined generously. 1 = Poor, 5 = Excellent.  ", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H1a_health_init_edu_hist_18_29.png", ///
+	as(png) replace	width(2560) height(1440) //quality(100)
+	
+	
+* Year 
+use "$dir_validation_data/H1a_sample", clear
+
+sum p1-p5 // inspect negative values 
+		
+gen p1p2 = p1 + p2 
+gen p1p2p3 = p1p2 + p3
+gen p1p2p3p4 = p1p2p3 + p4 // generate cumulative probabilities for all options
+
+forvalues draw = 0/19 {
+	// replication-specific seed (12345, 12346, ...) keeps every draw reproducible
+	set seed `=12345 + `draw''
+	gen rnd = runiform()
+	gen pred_health`draw' = cond((rnd < p1), 1, cond(rnd < p1p2, 2, ///
+		cond(rnd < p1p2p3, 3, cond(rnd < p1p2p3p4, 4, 5))))
+	local level = 0
+	foreach cat in poor fair good vgood excel {
+		local ++level
+		gen pred_health_`cat'`draw' = (pred_health`draw' == `level')
+	}
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+gen health_poor = (dhe == 1)
+gen health_fair = (dhe == 2)
+gen health_good = (dhe == 3)
+gen health_vgood = (dhe == 4)
+gen health_excel = (dhe == 5)
+
+preserve 
+// dwt-weighted means by stm: observed category dummies and each per-draw predicted dummy
+collapse (mean) health_* pred_health_*  [aw = dwt], by(stm)
+// move the per-draw variables to the front, grouped by category; the 0-19 varlist ranges below need them adjacent
+order pred_health_poor* pred_health_fair* pred_health_good* ///
+	pred_health_vgood* pred_health_excel*
+// pooled prediction: row-wise average of each category's share over draws 0-19
+egen pred_health_poor = rowmean(pred_health_poor0-pred_health_poor19)
+egen pred_health_fair = rowmean(pred_health_fair0-pred_health_fair19)
+egen pred_health_good = rowmean(pred_health_good0-pred_health_good19)
+egen pred_health_vgood = rowmean(pred_health_vgood0-pred_health_vgood19)
+egen pred_health_excel = rowmean(pred_health_excel0-pred_health_excel19)
+// NOTE(review): stm appears to be stored as years since 2000 (it is plotted on the "Year" axis) - confirm
+replace stm = 2000+stm 
+
+twoway ///
+(line pred_health_poor stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line health_poor stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Poor ") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_fair stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_fair stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Fair ") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_good stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_good stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Good ") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_vgood stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_vgood stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Very good") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_excel stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_excel stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Excellent") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Health Status") ///
+	subtitle("In initial education spell ") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are in their initial education spell (18-29). Initial education spell" "defined generously.", size(vsmall))
+		
+graph export "$dir_validation_graphs/health/int_validation_H1a_health_init_edu_ts_18_29_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+	
+graph drop _all 	
+	
+restore	
+
+* Gender
+preserve 
+	
+collapse (mean) health_* pred_health_*  [aw = dwt], by(stm dgn)
+	
+order pred_health_poor* pred_health_fair* pred_health_good* ///
+	pred_health_vgood* pred_health_excel*
+
+egen pred_health_poor = rowmean(pred_health_poor0-pred_health_poor19)
+egen pred_health_fair = rowmean(pred_health_fair0-pred_health_fair19)
+egen pred_health_good = rowmean(pred_health_good0-pred_health_good19)
+egen pred_health_vgood = rowmean(pred_health_vgood0-pred_health_vgood19)
+egen pred_health_excel = rowmean(pred_health_excel0-pred_health_excel19)
+
+replace stm = 2000 + stm 
+
+
+* Females 
+twoway ///
+(line pred_health_poor stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line health_poor stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Poor") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_fair stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_fair stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Fair") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_good stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_good stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Good") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_vgood stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_vgood stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Very good") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_excel stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_excel stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Excellent ") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Health Status") ///
+	subtitle("In initial education spell, females") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are in their initial education spell (18-29). Initial education spell" "defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H1a_health_init_edu_ts_18_29_female.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 		
+* Males 
+twoway ///
+(line pred_health_poor stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line health_poor stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Poor") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_fair stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_fair stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Fair") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_good stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_good stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_vgood stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_vgood stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Very good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_excel stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_excel stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Excellent ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Health Status") ///
+	subtitle("In initial education spell, males ") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are in their initial education spell (18-29). Initial education spell" "defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H1a_health_init_edu_ts_18_29_male.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 		
+	
+restore
+
+
+**********************************************
+* H1b: Health status, left initial edu spell *
+**********************************************
+
+* Overall 
+use "$dir_validation_data/H1b_sample", clear
+
+sum p1-p5 // inspect negative values 
+		
+gen p1p2 = p1 + p2 // running sums of the category probabilities p1-p4
+gen p1p2p3 = p1p2 + p3
+gen p1p2p3p4 = p1p2p3 + p4 // generate cumulative probabilities for all options
+set seed 12345 // fix the RNG state so this single draw (and the exported histogram) is reproducible
+gen rnd = runiform()
+gen pred_health = cond((rnd < p1), 1, cond(rnd < p1p2, 2, ///
+	cond(rnd < p1p2p3, 3, cond(rnd < p1p2p3p4, 4, 5)))) // first category whose cumulative bound exceeds rnd
+
+keep if in_sample == 1 	
+	
+twoway (histogram pred_health if in_sample == 1, color(red)) ///
+	(histogram dhe if in_sample == 1, color(none) lcolor(black)), ///
+	xtitle (Self-rated health) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Health Status") ///
+	subtitle("Left initial education spell ") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of self-reported health status. Estimation sample plotted. Sample contains individual" "who have left their initial education spell and aged 18+. Initial education spell defined generously. 1 = Poor, 5 = Excellent.  ", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H1b_health_left_edu_hist_all.png", /// H1b prefix: this histogram belongs to the left-education process, not H1a
+	as(png) replace	width(2560) height(1440) //quality(100)	
+	
+	
+* Year 
+use "$dir_validation_data/H1b_sample", clear
+
+sum p1-p5 // inspect negative values 
+		
+gen p1p2 = p1 + p2 
+gen p1p2p3 = p1p2 + p3
+gen p1p2p3p4 = p1p2p3 + p4 // generate cumulative probabilities for all options
+
+forvalues draw = 0/19 {
+	// replication-specific seed (12345, 12346, ...) keeps every draw reproducible
+	set seed `=12345 + `draw''
+	gen rnd = runiform()
+	gen pred_health`draw' = cond((rnd < p1), 1, cond(rnd < p1p2, 2, ///
+		cond(rnd < p1p2p3, 3, cond(rnd < p1p2p3p4, 4, 5))))
+	local level = 0
+	foreach cat in poor fair good vgood excel {
+		local ++level
+		gen pred_health_`cat'`draw' = (pred_health`draw' == `level')
+	}
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+gen health_poor = (dhe == 1)
+gen health_fair = (dhe == 2)
+gen health_good = (dhe == 3)
+gen health_vgood = (dhe == 4)
+gen health_excel = (dhe == 5)
+
+preserve 
+
+collapse (mean) health_* pred_health_*  [aw = dwt], by(stm)
+
+order pred_health_poor* pred_health_fair* pred_health_good* ///
+	pred_health_vgood* pred_health_excel*
+
+egen pred_health_poor = rowmean(pred_health_poor0-pred_health_poor19)
+egen pred_health_fair = rowmean(pred_health_fair0-pred_health_fair19)
+egen pred_health_good = rowmean(pred_health_good0-pred_health_good19)
+egen pred_health_vgood = rowmean(pred_health_vgood0-pred_health_vgood19)
+egen pred_health_excel = rowmean(pred_health_excel0-pred_health_excel19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_health_poor stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line health_poor stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Poor ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_fair stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_fair stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Fair ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_good stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_good stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_vgood stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_vgood stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Very good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_excel stm, sort color(green) legend(label(1 "Pred"))) ///
+(line health_excel stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Excellent ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Health Status") ///
+	subtitle("Left initial education spell ") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell" "defined generously.", size(vsmall))
+		
+graph export "$dir_validation_graphs/health/int_validation_H1b_health_left_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore	
+
+
+* Gender
+preserve 
+	
+collapse (mean) health_* pred_health_*  [aw = dwt], by(stm dgn)
+	
+order pred_health_poor* pred_health_fair* pred_health_good* ///
+	pred_health_vgood* pred_health_excel*
+
+egen pred_health_poor = rowmean(pred_health_poor0-pred_health_poor19)
+egen pred_health_fair = rowmean(pred_health_fair0-pred_health_fair19)
+egen pred_health_good = rowmean(pred_health_good0-pred_health_good19)
+egen pred_health_vgood = rowmean(pred_health_vgood0-pred_health_vgood19)
+egen pred_health_excel = rowmean(pred_health_excel0-pred_health_excel19)
+
+replace stm = 2000 + stm 
+
+* Female
+twoway ///
+(line pred_health_poor stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line health_poor stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Poor ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_fair stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_fair stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Fair ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_good stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_good stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_vgood stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_vgood stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Very good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_excel stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_excel stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Excellent ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Health Status") ///
+	subtitle("Left initial education spell, females ") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell" "defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H1b_health_left_edu_ts_all_female.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+* Males 
+twoway ///
+(line pred_health_poor stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line health_poor stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Poor ") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_fair stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_fair stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Fair ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_good stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_good stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_vgood stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_vgood stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Very good ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_health_excel stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line health_excel stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Excellent ") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Health Status") ///
+	subtitle("Left initial education spell, males ") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell" "defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H1b_health_left_edu_ts_all_male.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
+
+
+***********************************************************
+* H2b: Long-term sick or disabled, left initial edu spell *
+***********************************************************
+
+* Overall 
+use "$dir_validation_data/H2b_sample", clear 
+
+set seed 12345
+gen rnd = runiform()
+// predicted dummy: 1 when rnd <= p <= 1, otherwise 0
+gen pred_dlltsd01 = inrange(p, rnd, 1)
+
+keep if in_sample == 1 
+
+twoway ///
+	(histogram pred_dlltsd01, color(red)) ///
+	(histogram dlltsd01, color(none) lcolor(black)), ///
+	xtitle (Disabled/long-term sick ) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Disability/long-term sick") ///
+	subtitle("Left initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of disability/long-term sick dummy. Estimation sample plotted. Sample contains individuals" "who have left their initial education spell and aged 18+. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/health/int_validation_H2b_disablilty_left_edu_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+
+* Year 
+use "$dir_validation_data/H2b_sample", clear
+
+// construct multiple versions of the predicted outcome allowing for different 
+// random draws 
+forvalues draw = 0/19 {
+	// replication-specific seed (12345, 12346, ...) keeps every draw reproducible
+	set seed `=12345 + `draw''
+	gen rnd = runiform()
+	// predicted dummy for this draw: 1 when rnd <= p <= 1, otherwise 0
+	gen pred_dlltsd01`draw' = inrange(p, rnd, 1)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+// for each draw, calculate the dwt-weighted mean of the predicted dummy by stm (alongside observed dlltsd01) 
+collapse (mean) dlltsd01 pred_dlltsd01* [aw = dwt], by(stm)
+// move the per-draw variables to the front; the 0-19 varlist range below needs them adjacent
+order pred_dlltsd01*
+
+// take the row-wise average across the 20 draws 
+egen pred_dlltsd01 = rowmean(pred_dlltsd010-pred_dlltsd0119)
+replace stm = 2000 + stm // NOTE(review): stm appears to be years since 2000 - confirm
+
+twoway ///
+(line pred_dlltsd01 stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dlltsd01 stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+	title("Disability/long-term sick") ///
+	subtitle("Left initial education spell") ///
+	xtitle("Year") ytitle("Share")  ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial" "education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/health/int_validation_H2b_disablilty_left_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+graph drop _all 
+ 
+restore  
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dlltsd01 pred_dlltsd01* [aw = dwt], by(dag)
+
+order pred_dlltsd01*
+
+egen pred_dlltsd01 = rowmean(pred_dlltsd010-pred_dlltsd0119)
+
+twoway ///
+(line pred_dlltsd01 dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dlltsd01 dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+	title("Disability/long-term sick") ///
+	subtitle("Left initial education spell, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share")  ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell " "defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/health/int_validation_H2b_disablilty_left_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dlltsd01 pred_dlltsd01* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dlltsd01*
+
+egen pred_dlltsd01 = rowmean(pred_dlltsd010-pred_dlltsd0119)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlltsd01 stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dlltsd01 stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway /// income panel 4 of 5: pooled predicted vs observed share by year for ydses_c5 == 4
+(line pred_dlltsd01 stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") /// title spelling corrected (was "Forth")
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Disability/long-term sick") ///
+	subtitle("Left initial education spell, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell defined" "generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H2b_disablilty_left_edu_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+
+graph drop _all 	
+	
+restore
+
+
+* Education
+preserve
+
+collapse (mean) dlltsd01 pred_dlltsd01* [aw = dwt], by(deh_c3 stm)
+
+order pred_dlltsd01*
+
+egen pred_dlltsd01 = rowmean(pred_dlltsd010-pred_dlltsd0119)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlltsd01 stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dlltsd01 stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), /// 
+name(graph1) title("High education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Disability/long-term sick") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell defined" "generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H2b_disablilty_left_edu_ts_all_both_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all 	
+	
+restore
+
+
+* Marital status 
+preserve
+
+collapse (mean) dlltsd01 pred_dlltsd01* [aw = dwt], by(dcpst stm)
+
+order pred_dlltsd01*
+
+egen pred_dlltsd01 = rowmean(pred_dlltsd010-pred_dlltsd0119)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlltsd01 stm if dcpst == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dlltsd01 stm if dcpst == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if dcpst == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if dcpst == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Single") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlltsd01 stm if dcpst == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlltsd01 stm if dcpst == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Previously partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3,  ///
+	title("Disability/long-term sick") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell and aged 18+. Initial education spell defined" "generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/health/int_validation_H2b_disablilty_left_edu_ts_all_both_partnership.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+
+graph drop _all 	
+	
+restore
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_home_ownership.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_home_ownership.do
new file mode 100644
index 000000000..b165bd795
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_home_ownership.do
@@ -0,0 +1,346 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Home ownership
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova 
+* LAST UPDATE:		July 2025 
+* COUNTRY: 			UK 
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					home ownership process estimated. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_home_ownership.do"
+********************************************************************************
+
+************************
+* HO1a: Home ownership *
+************************
+
+* Overall 
+use "$dir_validation_data/HO1a_sample", clear 
+
+set seed 12345
+gen rnd = runiform() 	
+gen pred_dhh_owned = 0 
+replace pred_dhh_owned = 1 if inrange(p,rnd,1)
+
+keep if in_sample == 1 
+
+twoway ///
+	(histogram pred_dhh_owned, color(red)) ///
+	(histogram dhh_owned, color(none) lcolor(black) ), ///
+	xtitle (Home ownership) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Home Ownership")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating homeownership. Estimation sample plotted. Sample contains all individuals 18+" "years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+* Year 
+use "$dir_validation_data/HO1a_sample", clear 
+
+forvalues draw = 0/19 {
+	* Draw-specific seed (12345 + draw) keeps each replication reproducible
+	set seed `=12345 + `draw''
+	gen rnd = runiform()
+	* Predicted ownership is 1 when p lies in [rnd, 1], otherwise 0
+	gen pred_dhh_owned`draw' = inrange(p, rnd, 1)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) dhh_owned pred_dhh_owned* [aw = dwt], by(stm)
+
+order pred_dhh_owned*
+
+egen pred_dhh_owned = rowmean(pred_dhh_owned0-pred_dhh_owned19)
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dhh_owned stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dhh_owned stm, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+title("Home Ownership")  ///
+	xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains all individuals 18+ years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100) 
+ 
+graph drop _all  
+ 
+restore  
+
+
+* Gender 
+preserve
+
+collapse (mean) dhh_owned pred_dhh_owned* [aw = dwt], by(dgn stm)
+
+order pred_dhh_owned*
+
+egen pred_dhh_owned = rowmean(pred_dhh_owned0-pred_dhh_owned19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dhh_owned stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dhh_owned stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dhh_owned stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2,  ///
+	title("Home Ownership")  ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains all individuals 18+ years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all  
+
+restore 
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dhh_owned pred_dhh_owned* [aw = dwt], by(dag)
+
+order pred_dhh_owned*
+
+egen pred_dhh_owned = rowmean(pred_dhh_owned0-pred_dhh_owned19)
+
+twoway ///
+(line pred_dhh_owned dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dhh_owned dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+	title("Home Ownership")  ///
+	subtitle("Share by age") ///
+	xtitle("Age") ytitle("Share") xlabel(, labsize(small)) ///
+	ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains all individuals 18+ years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+restore
+
+* Income 
+preserve
+
+collapse (mean) dhh_owned pred_dhh_owned* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dhh_owned*
+
+egen pred_dhh_owned = rowmean(pred_dhh_owned0-pred_dhh_owned19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dhh_owned stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dhh_owned stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway /// Fourth income quintile panel: predicted vs observed ownership share by year
+(line pred_dhh_owned stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Home Ownership")  ///
+	subtitle("By hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains all individuals 18+ years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Education 
+preserve
+
+collapse (mean) dhh_owned pred_dhh_owned* [aw = dwt], by(deh_c3 stm)
+
+order pred_dhh_owned*
+
+egen pred_dhh_owned = rowmean(pred_dhh_owned0-pred_dhh_owned19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dhh_owned stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dhh_owned stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("High education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3,  ///
+	title("Home Ownership")  ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains all individuals 18+ years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_ts_all_both_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
+
+
+* Partnership status 
+preserve
+
+collapse (mean) dhh_owned pred_dhh_owned* [aw = dwt], by(dcpst stm)
+
+order pred_dhh_owned*
+
+egen pred_dhh_owned = rowmean(pred_dhh_owned0-pred_dhh_owned19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dhh_owned stm if dcpst == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dhh_owned stm if dcpst == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if dcpst == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if dcpst == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Single") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dhh_owned stm if dcpst == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dhh_owned stm if dcpst == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Previously partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3,  ///
+	title("Home Ownership")  ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains all individuals 18+ years old.", size(vsmall))
+
+graph export "$dir_validation_graphs/home_ownership/int_validation_HO1a_homeownership_ts_all_both_partnership.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_income.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_income.do
new file mode 100644
index 000000000..f398b5d83
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_income.do
@@ -0,0 +1,1184 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Income
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova 
+* LAST UPDATE:		July 2025
+* COUNTRY: 			UK   
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					hurdle models used for the income processes. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_income.do"
+********************************************************************************
+
+* I3a selection - capital income, in initial education spell
+
+use "$dir_validation_data/I3a_selection_sample", clear 
+
+forvalues draw = 0/19 {
+	* Draw-specific seed (12345 + draw) keeps each replication reproducible
+	set seed `=12345 + `draw''
+	gen rnd = runiform()
+	* Predicted receipt is 1 when p lies in [rnd, 1], otherwise 0
+	gen pred_receives_ypncp`draw' = inrange(p, rnd, 1)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+replace stm = 2000 + stm 
+egen pred_receives_ypncp = rowmean(pred_receives_ypncp0-pred_receives_ypncp19)
+
+* Raw prediction vs observed
+twoway /// I3a: histogram of the first random draw of predicted receipt vs the observed dummy
+	(histogram pred_receives_ypncp0, color(red)) ///
+	(histogram receives_ypncp, color(none) lcolor(black)), ///
+	xtitle (Receives capital income) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Receives Capital Income") ///
+	subtitle("In initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating capital income is received. Estimation sample plotted. Sample contains all" "individuals aged 16+, who are in their initial education spell. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I3a_selection_capital_init_edu_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+	
+* Year 
+preserve
+
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(stm)
+
+twoway /// I3a: yearly share receiving capital income, mean of 20 draws vs observed
+(line pred_receives_ypncp stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Receives Capital Income") ///
+	subtitle("In initial education spell") ///
+	xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))
+	
+graph export "$dir_validation_graphs/income/int_validation_I3a_selection_capital_init_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+	
+restore	
+	
+graph drop _all	
+	
+
+* By gender 	
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(stm dgn)
+	
+twoway ///
+(line pred_receives_ypncp stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+twoway ///
+(line pred_receives_ypncp stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+grc1leg graph1 graph2 ,  /// I3a: combine female and male panels under graph1's legend
+	title("Receives Capital Income") ///
+	subtitle("In initial education spell") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))
+	
+graph export "$dir_validation_graphs/income/int_validation_I3a_selection_capital_init_edu_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+restore 
+
+graph drop _all 
+	
+	
+* Share by age 
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(dag)
+
+twoway ///
+(line pred_receives_ypncp dag, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Receives Capital Income") subtitle("In initial education spell, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))	
+
+graph export "$dir_validation_graphs/income/int_validation_I3a_selection_capital_init_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+restore
+	
+graph drop _all	
+	
+	
+* Hh income 	
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(ydses_c5 stm)
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Receives Capital Income ") ///
+	subtitle("In initial education spell, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export "$dir_validation_graphs/income/int_validation_I3a_selection_capital_init_edu_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+restore
+	
+graph drop _all		
+	
+	
+* Marital status 
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(dcpst stm)
+	
+twoway ///
+(line pred_receives_ypncp stm if dcpst == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dcpst == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if dcpst == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dcpst == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Single") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if dcpst == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dcpst == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Previously partnered") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Receives Capital Income") ///
+	subtitle("In initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are observed in their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export ///
+"$dir_validation_graphs/income/int_validation_I3a_selection_capital_init_edu_ts_all_both_partnership.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+restore 
+
+graph drop _all 			
+	
+		
+* I3b selection - capital income, left initial education spell 
+
+use "$dir_validation_data/I3b_selection_sample", clear 
+
+forvalues draw = 0/19 {
+	* Draw-specific seed (12345 + draw) keeps each replication reproducible
+	set seed `=12345 + `draw''
+	gen rnd = runiform()
+	* Predicted receipt is 1 when p lies in [rnd, 1], otherwise 0
+	gen pred_receives_ypncp`draw' = inrange(p, rnd, 1)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+replace stm = 2000 + stm 
+egen pred_receives_ypncp = rowmean(pred_receives_ypncp0-pred_receives_ypncp19)
+
+* Raw prediction vs observed
+twoway /// I3b: histogram of the first random draw of predicted receipt vs the observed dummy
+	(histogram pred_receives_ypncp0, color(red)) ///
+	(histogram receives_ypncp, color(none) lcolor(black)), ///
+	xtitle (Receives capital income) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Receives Capital Income") ///
+	subtitle("Left initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating capital income is received. Estimation sample plotted. Sample contains all" "individuals aged 16+, who have left their initial education spell. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I3b_selection_capital_left_edu_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+	
+* Year 
+preserve
+
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(stm)
+
+twoway /// I3b: yearly share receiving capital income, mean of 20 draws vs observed
+(line pred_receives_ypncp stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Receives Capital Income") ///
+	subtitle("Left initial education spell") ///
+	xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))
+	
+graph export "$dir_validation_graphs/income/int_validation_I3b_selection_capital_left_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+	
+restore	
+	
+graph drop _all	
+	
+
+* By gender 	
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(stm dgn)
+	
+twoway ///
+(line pred_receives_ypncp stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+twoway ///
+(line pred_receives_ypncp stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) 
+
+grc1leg graph1 graph2 ,  /// I3b: combine female and male panels under graph1's legend
+	title("Receives Capital Income") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))
+	
+graph export "$dir_validation_graphs/income/int_validation_I3b_selection_capital_left_edu_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+restore 
+
+graph drop _all 
+	
+	
+* Share by age 
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(dag)
+
+twoway /// I3b age profile: share receiving capital income, mean of 20 draws vs observed
+(line pred_receives_ypncp dag, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Receives Capital Income") subtitle("Left initial education spell, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I3b_selection_capital_left_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+restore
+	
+graph drop _all	
+	
+	
+* Hh income 	
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(ydses_c5 stm)
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_receives_ypncp stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypncp stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Receives Capital Income ") ///
+	subtitle("Left initial education spell, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export "$dir_validation_graphs/income/int_validation_I3b_selection_capital_left_edu_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+restore
+	
+graph drop _all		
+	
+	
+* Marital status 
+* Weighted yearly means of observed and predicted capital-income receipt by
+* partnership status (dcpst); one panel per status, combined under a shared
+* legend and exported.
+preserve
+	
+collapse (mean) receives_ypncp pred_receives_ypncp  [aw = dwt], by(dcpst stm)
+
+* Panel titles, indexed by the dcpst code used in the if-conditions below
+local mar_lbl1 "Partnered"
+local mar_lbl2 "Single"
+local mar_lbl3 "Previously partnered"
+
+forvalues s = 1/3 {
+	twoway ///
+	(line pred_receives_ypncp stm if dcpst == `s', sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line receives_ypncp stm if dcpst == `s', sort color(green) color(green%20) ///
+		lpattern(dash) legend(label(2 "Observed"))), ///
+	name(graph`s') title("`mar_lbl`s''") xtitle("Year") ytitle("") ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		legend(size(small)) ///
+		graphregion(color(white))
+}
+
+* Note text corrected: the sample is those who have LEFT their initial
+* education spell (the word "left" was missing).
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Receives Capital Income") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who have left their initial education spell, aged 16+ years old." "Initial education spell defined generously. Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export ///
+"$dir_validation_graphs/income/int_validation_I3b_selection_capital_left_edu_ts_all_both_partnership.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+* Return to the individual-level data held before the collapse
+restore 
+
+graph drop _all 			
+	
+	
+******************************************	
+* I3a amount - in initial education spell 
+******************************************
+* Load the I3a level sample and keep only the estimation sample
+use "$dir_validation_data/I3a_level_sample", clear
+keep if in_sample == 1 
+
+* Predicted log amount (p) plus a normal draw scaled by sigma, then
+* exponentiated to obtain a predicted level
+generate pred_ln_ypncp = p 
+capture drop epsilon
+generate epsilon = sigma*rnormal()
+generate pred_ypncp = exp(pred_ln_ypncp + epsilon) 
+
+* Trim predictions: anything outside the 1st-99th percentile range becomes missing
+summarize pred_ypncp, detail
+replace pred_ypncp = . if !inrange(pred_ypncp, r(p1), r(p99))
+
+* Overlay predicted (filled) and observed (outlined) histograms
+local note_l1 "Notes: Predicted vs observed of capital income received. Estimation sample plotted. Sample contains all"
+local note_l2 "individual age 16+, who are in their initial education spell. Initial education spell defined generously. GBP per year, in 2015 prices."
+local note_l3 "Top and bottom percentiles of predicted trimmed."
+
+twoway ///
+	(histogram pred_ypncp, width(1) color(green)) ///
+	(histogram ypncp_lvl, width(1) color(none) lcolor(black)), ///
+	name(levels, replace) ///
+	title("Capital Income Amount") ///
+	subtitle("In initial education spell") ///
+	xtitle(Capital income (GBP)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed") size(small)) ///
+	note("`note_l1'" "`note_l2'" "`note_l3'", size(vsmall)) ///
+	graphregion(color(white))
+
+graph export ///
+	"$dir_validation_graphs/income/int_validation_I3a_amount_capital_init_edu_hist_all.png", ///
+	replace as(png) width(2560) height(1440)
+
+graph drop _all
+	
+	
+* By gender 
+* One histogram panel per gender: graph1 is males (dgn == 1), graph2 is
+* females (dgn == 0); the two panels are then combined under one legend.
+local panel = 0
+foreach sex in 1 0 {
+	local ++panel
+	local sex_lbl : word `panel' of Males Females
+	twoway ///
+		(histogram pred_ypncp if dgn == `sex', width(1) color(green) ///
+			legend(lab(1 "Predicted"))) ///
+		(histogram ypncp_lvl if dgn == `sex', width(1) color(none) lcolor(black) ///
+			legend(lab( 2 "Observed"))), ///
+		name(graph`panel', replace) subtitle("`sex_lbl'") ///
+		xtitle(Capital income (GBP)) ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		graphregion(color(white))
+}
+
+local note_l1 "Notes: Predicted vs observed of capital income received. Estimation sample plotted. Sample contains all"
+local note_l2 "individual age 16+, who are in their initial education spell. Initial education spell defined generously. GBP per year, in 2015 prices."
+local note_l3 "Top and bottom percentiles of predicted trimmed."
+
+grc1leg graph1 graph2, ///
+	rows(1) legendfrom(graph1) ///
+	title("Capital Income Amount") ///
+	subtitle("In initial education spell") ///
+	note("`note_l1'" "`note_l2'" "`note_l3'", size(vsmall)) ///
+	graphregion(color(white))
+
+graph export ///
+	"$dir_validation_graphs/income/int_validation_I3a_amount_capital_init_edu_hist_all_gender.png", ///
+	replace as(png) width(2560) height(1440)
+
+graph drop _all 	
+	
+**********************************************
+* I3b amount - left initial education spell 
+**********************************************
+* Load the I3b level sample and keep only the estimation sample
+use "$dir_validation_data/I3b_level_sample", clear
+keep if in_sample == 1 
+
+* Predicted log amount (p) plus a normal draw scaled by sigma, then
+* exponentiated to obtain a predicted level
+generate pred_ln_ypncp = p 
+capture drop epsilon
+generate epsilon = sigma*rnormal()
+generate pred_ypncp = exp(pred_ln_ypncp + epsilon) 
+
+* Trim predictions: anything outside the 1st-99th percentile range becomes missing
+summarize pred_ypncp, detail
+replace pred_ypncp = . if !inrange(pred_ypncp, r(p1), r(p99))
+
+* Overlay predicted (filled) and observed (outlined) histograms
+local note_l1 "Notes: Predicted vs observed of capital income received. Estimation sample plotted. Sample contains all"
+local note_l2 "individual age 16+, who have left their initial education spell. Initial education spell defined generously. GBP per year, in 2015 prices."
+local note_l3 "Top and bottom percentiles of predicted trimmed."
+
+twoway ///
+	(histogram pred_ypncp, width(1) color(green)) ///
+	(histogram ypncp_lvl, width(1) color(none) lcolor(black)), ///
+	name(levels, replace) ///
+	title("Capital Income Amount") ///
+	subtitle("Left initial education spell") ///
+	xtitle(Capital income (GBP)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed") size(small)) ///
+	note("`note_l1'" "`note_l2'" "`note_l3'", size(vsmall)) ///
+	graphregion(color(white))
+
+graph export ///
+	"$dir_validation_graphs/income/int_validation_I3b_amount_capital_left_edu_hist_all.png", ///
+	replace as(png) width(2560) height(1440)
+
+graph drop _all
+	
+	
+* By gender 
+* One histogram panel per gender: graph1 is males (dgn == 1), graph2 is
+* females (dgn == 0); the two panels are then combined under one legend.
+local panel = 0
+foreach sex in 1 0 {
+	local ++panel
+	local sex_lbl : word `panel' of Males Females
+	twoway ///
+		(histogram pred_ypncp if dgn == `sex', width(1) color(green) ///
+			legend(lab(1 "Predicted"))) ///
+		(histogram ypncp_lvl if dgn == `sex', width(1) color(none) lcolor(black) ///
+			legend(lab( 2 "Observed"))), ///
+		name(graph`panel', replace) subtitle("`sex_lbl'") ///
+		xtitle(Capital income (GBP)) ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		graphregion(color(white))
+}
+
+local note_l1 "Notes: Predicted vs observed of capital income received. Estimation sample plotted. Sample contains all"
+local note_l2 "individual age 16+, who have left their initial education spell. Initial education spell defined generously. GBP per year, in 2015 prices."
+local note_l3 "Top and bottom percentiles of predicted trimmed."
+
+grc1leg graph1 graph2, ///
+	rows(1) legendfrom(graph1) ///
+	title("Capital Income Amount") ///
+	subtitle("Left initial education spell") ///
+	note("`note_l1'" "`note_l2'" "`note_l3'", size(vsmall)) ///
+	graphregion(color(white))
+
+graph export ///
+	"$dir_validation_graphs/income/int_validation_I3b_amount_capital_left_edu_hist_all_gender.png", ///
+	replace as(png) width(2560) height(1440)
+
+
+* By education 	
+* One histogram panel per education level (deh_c3 == 3 low, 2 medium, 1 high,
+* following the panel labels), combined under a shared legend.
+* FIX: the observed histogram is now restricted to the same education group
+* as the predicted one. It was previously filtered on gender (dgn), so each
+* panel compared predictions for an education group with observations for a
+* gender group.
+local edu_lbl3 "Low education"
+local edu_lbl2 "Medium education"
+local edu_lbl1 "High education"
+
+* Panels keep their original names: graph1 = low, graph2 = medium, graph3 = high
+local panel = 0
+foreach edu in 3 2 1 {
+	local ++panel
+	twoway (hist pred_ypncp if deh_c3 == `edu', width(1) color(green) ///
+		legend(lab(1 "Predicted"))) ///
+	(histogram ypncp_lvl if deh_c3 == `edu', width(1) color(none) lcolor(black) ///
+		legend(lab( 2 "Observed"))), ///
+		subtitle("`edu_lbl`edu''") name(graph`panel', replace) ///
+		xtitle (Capital income (GBP)) ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		graphregion(color(white)) 
+}
+	
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Capital Income Amount") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Predicted vs observed of capital income received. Estimation sample plotted. Sample contains all" "individual age 16+, who have left their initial education spell. Initial education spell defined generously. GBP per year, in 2015 prices." "Top and bottom percentiles of predicted trimmed.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I3b_amount_capital_left_edu_hist_all_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+********************************
+* I4b: Amount of pension income.
+******************************** 
+* Compares predicted and observed private pension income amounts for the
+* I4b level sample.
+
+use "$dir_validation_data/I4b_level_sample", clear
+
+keep if in_sample == 1 
+
+* Obtain predicted log amount 
+gen pred_ln_ypnoab = p 
+
+* Obtain random component (normal draw scaled by sigma)
+cap drop epsilon
+gen epsilon = rnormal()*sigma
+
+* Convert into level with random component 
+gen pred_ypnoab = exp(pred_ln_ypnoab + epsilon) 
+
+* Trim predictions: values below the 1st or above the 99th percentile are
+* set to missing
+sum pred_ypnoab, d
+replace pred_ypnoab = . if pred_ypnoab < r(p1) | pred_ypnoab > r(p99)
+
+* Note wording corrected to "individuals" (plural), matching the gender and
+* education figures of this section.
+twoway (hist pred_ypnoab, width(1) color(green)) ///
+	(hist ypnoab_lvl, width(1) color(none) lcolor(black)), ///
+	xtitle (Private Pension Income (GBP)) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Private Pension Income Amount") ///
+	subtitle("Retired in the past year") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of Private Pension Income received. Estimation sample plotted. Sample contains all" "individuals who were retired in the previous year. GBP per year, in 2015 prices." "Top and bottom percentiles of predicted trimmed.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I4b_amount_pension_retired_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all
+	
+	
+* By gender 
+* One histogram panel per gender: graph1 is males (dgn == 1), graph2 is
+* females (dgn == 0); the two panels are then combined under one legend.
+local panel = 0
+foreach sex in 1 0 {
+	local ++panel
+	local sex_lbl : word `panel' of Males Females
+	twoway ///
+		(histogram pred_ypnoab if dgn == `sex', width(1) color(green) ///
+			legend(lab(1 "Predicted"))) ///
+		(histogram ypnoab_lvl if dgn == `sex', width(1) color(none) lcolor(black) ///
+			legend(lab( 2 "Observed"))), ///
+		name(graph`panel', replace) subtitle("`sex_lbl'") ///
+		xtitle(Private Pension Income (GBP)) ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		graphregion(color(white))
+}
+
+local note_l1 "Notes: Predicted vs observed of Private Pension Income received. Estimation sample plotted. Sample contains all"
+local note_l2 "individuals who were retired in the previous year. GBP per year, in 2015 prices."
+local note_l3 "Top and bottom percentiles of predicted trimmed."
+
+grc1leg graph1 graph2, ///
+	rows(1) legendfrom(graph1) ///
+	title("Private Pension Income Amount") ///
+	subtitle("Retired in the past year") ///
+	note("`note_l1'" "`note_l2'" "`note_l3'", size(vsmall)) ///
+	graphregion(color(white))
+
+graph export ///
+	"$dir_validation_graphs/income/int_validation_I4b_amount_pension_retired_hist_all_gender.png", ///
+	replace as(png) width(2560) height(1440)
+
+
+* By education 	
+* One histogram panel per education level (deh_c3 == 3 low, 2 medium, 1 high,
+* following the panel labels), combined under a shared legend.
+* FIX: the observed histogram is now restricted to the same education group
+* as the predicted one. It was previously filtered on gender (dgn), so each
+* panel compared predictions for an education group with observations for a
+* gender group.
+local edu_lbl3 "Low education"
+local edu_lbl2 "Medium education"
+local edu_lbl1 "High education"
+
+* Panels keep their original names: graph1 = low, graph2 = medium, graph3 = high
+local panel = 0
+foreach edu in 3 2 1 {
+	local ++panel
+	twoway (hist pred_ypnoab if deh_c3 == `edu', width(1) color(green) ///
+		legend(lab(1 "Predicted"))) ///
+	(histogram ypnoab_lvl if deh_c3 == `edu', width(1) color(none) lcolor(black) ///
+		legend(lab( 2 "Observed"))), ///
+		subtitle("`edu_lbl`edu''") name(graph`panel', replace) ///
+		xtitle (Private Pension Income (GBP)) ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		graphregion(color(white)) 
+}
+	
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Private Pension Income Amount") ///
+	subtitle("Retired in the past year") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Predicted vs observed of Private Pension Income received. Estimation sample plotted. Sample contains all" "individuals who were retired in the previous year. GBP per year, in 2015 prices." "Top and bottom percentiles of predicted trimmed.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I4b_amount_pension_retired_hist_all_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+		
+
+***********************************************************************
+* I5a selection - private pension income, not retired in the past year  
+***********************************************************************
+use "$dir_validation_data/I5a_selection_sample", clear 
+
+* Twenty binary predictions, each from its own seeded uniform draw:
+* the prediction is 1 when rnd <= p <= 1 (p is presumably the predicted
+* probability from the selection model - confirm against the estimation file).
+forvalues i = 0/19 {
+	local my_seed = 12345 + `i'  
+    set seed `my_seed' 	
+	gen rnd = runiform() 	
+	gen pred_receives_ypnoab`i' = 0 
+	replace pred_receives_ypnoab`i' = 1 if inrange(p,rnd,1)
+	drop rnd
+}
+
+* Draws are made on the full file first; only then restrict to the estimation sample
+keep if in_sample == 1 
+
+* stm is shifted by 2000 so that the time axis shows calendar years
+replace stm = 2000 + stm 
+* Average of the 20 binary draws per observation
+egen pred_receives_ypnoab = rowmean(pred_receives_ypnoab0-pred_receives_ypnoab19)
+
+* Raw prediction vs observed (first draw only)
+* FIX: the subtitle said "In initial education spell" (copied from the
+* capital income section); this sample is people not retired in the past
+* year. Also corrects the spelling of "received" in the note.
+twoway ///
+	(histogram pred_receives_ypnoab0, color(red)) ///
+	(histogram receives_ypnoab, color(none) lcolor(black)), ///
+	xtitle (Receives private pension income) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Receives private pension income") ///
+	subtitle("Not retired in the past year") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating private pension income is received. Estimation sample plotted. Sample contains all" "individuals who were not retired last year.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I5a_selection_private_pension_notretired_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+	
+* Year 
+* Weighted yearly means of observed and predicted receipt of private pension
+* income, plotted as a time series.
+preserve
+
+collapse (mean) receives_ypnoab pred_receives_ypnoab  [aw = dwt], by(stm)
+
+* FIX: title and subtitle were copied from the capital income section
+* ("Receives Captial Income" / "In initial education spell"); they now
+* describe this process. The duplicated word "individuals" in the note is
+* removed.
+twoway ///
+(line pred_receives_ypnoab stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line receives_ypnoab stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Receives private pension income") ///
+	subtitle("Not retired in the past year") ///
+	xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who were not retired last year." "Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export "$dir_validation_graphs/income/int_validation_I5a_selection_private_pension_notretired_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+* Return to the individual-level data held before the collapse
+restore	
+	
+graph drop _all	
+	
+
+* By gender 	
+* Weighted yearly means by gender; graph1 is females (dgn == 0), graph2 is
+* males (dgn == 1), combined under a shared legend.
+preserve
+	
+collapse (mean) receives_ypnoab pred_receives_ypnoab  [aw = dwt], by(stm dgn)
+
+local sex_lbl0 "Females"
+local sex_lbl1 "Males"
+
+forvalues g = 0/1 {
+	local panel = `g' + 1
+	twoway ///
+	(line pred_receives_ypnoab stm if dgn == `g', sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line receives_ypnoab stm if dgn == `g', sort color(green) color(green%20) ///
+		lpattern(dash) legend(label(2 "Observed"))), ///
+	name(graph`panel') title("`sex_lbl`g''") xtitle("Year") ytitle("Share") ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		legend(size(small)) ///
+		graphregion(color(white)) 
+}
+
+* FIX: title and subtitle were copied from the capital income section
+* ("Receives Captial Income" / "In initial education spell"); they now
+* describe this process. The duplicated word "individuals" in the note is
+* removed.
+grc1leg graph1 graph2 ,  ///
+	title("Receives private pension income") ///
+	subtitle("Not retired in the past year") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who were not retired last year." "Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export "$dir_validation_graphs/income/int_validation_I5a_selection_private_pension_notretired_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+* Return to the individual-level data held before the collapse
+restore 
+
+graph drop _all 
+	
+	
+* Share by age 
+* Weighted means of observed and predicted receipt of private pension income
+* by age (dag).
+preserve
+	
+collapse (mean) receives_ypnoab pred_receives_ypnoab  [aw = dwt], by(dag)
+
+* FIX: the subtitle said "In initial education spell" (copied from the
+* capital income section); the garbled note ("individuals who individuals
+* who") is also repaired.
+twoway ///
+(line pred_receives_ypnoab dag, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line receives_ypnoab dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Receives private pension income") subtitle("Not retired in the past year, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who were not retired last year." "Predictions are the average over 20 random draws.", size(vsmall))	
+
+graph export "$dir_validation_graphs/income/int_validation_I5a_selection_private_pension_notretired_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+* Return to the individual-level data held before the collapse
+restore
+	
+graph drop _all	
+	
+	
+* Hh income 	
+* Weighted yearly means by household income quintile (ydses_c5); one panel
+* per quintile, combined under a shared legend.
+preserve
+	
+collapse (mean) receives_ypnoab pred_receives_ypnoab  [aw = dwt], by(ydses_c5 stm)
+
+* Panel titles, indexed by the ydses_c5 value used in the if-conditions below
+local q_lbl1 "First quintile"
+local q_lbl2 "Second quintile"
+local q_lbl3 "Third quintile"
+local q_lbl4 "Fourth quintile"
+local q_lbl5 "Fifth quintile"
+
+forvalues q = 1/5 {
+	twoway ///
+	(line pred_receives_ypnoab stm if ydses_c5 == `q', sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line receives_ypnoab stm if ydses_c5 == `q', sort color(green) color(green%20) ///
+		lpattern(dash) legend(label(2 "Observed"))), ///
+	name(graph`q') title("`q_lbl`q''") xtitle("Year") ytitle("") ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		legend(size(small)) ///
+		graphregion(color(white))
+}
+
+* FIX: the subtitle said "In initial education spell" (copied from the
+* capital income section); this sample is people not retired in the past year.
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Receives private pension income ") ///
+	subtitle("Not retired in the past year, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who were not retired last year." "Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export "$dir_validation_graphs/income/int_validation_I5a_selection_private_pension_notretired_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+* Return to the individual-level data held before the collapse
+restore
+	
+graph drop _all		
+	
+	
+* Marital status 
+* Weighted yearly means by partnership status (dcpst); one panel per status,
+* combined under a shared legend.
+preserve
+	
+collapse (mean) receives_ypnoab pred_receives_ypnoab  [aw = dwt], by(dcpst stm)
+
+* Panel titles, indexed by the dcpst code used in the if-conditions below
+local mar_lbl1 "Partnered"
+local mar_lbl2 "Single"
+local mar_lbl3 "Previously partnered"
+
+forvalues s = 1/3 {
+	twoway ///
+	(line pred_receives_ypnoab stm if dcpst == `s', sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line receives_ypnoab stm if dcpst == `s', sort color(green) color(green%20) ///
+		lpattern(dash) legend(label(2 "Observed"))), ///
+	name(graph`s') title("`mar_lbl`s''") xtitle("Year") ytitle("") ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		legend(size(small)) ///
+		graphregion(color(white))
+}
+
+* FIX: the subtitle said "In initial education spell" (copied from the
+* capital income section); this sample is people not retired in the past year.
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Receives private pension income") ///
+	subtitle("Not retired in the past year") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who were not retired last year." "Predictions are the average over 20 random draws.", size(vsmall))	
+	
+graph export ///
+"$dir_validation_graphs/income/int_validation_I5a_selection_private_pension_notretired_ts_all_both_partnership.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+* Return to the individual-level data held before the collapse
+restore 
+
+graph drop _all 			
+	
+********************************
+* I5b: Amount of pension income.
+******************************** 
+* Compares predicted and observed private pension income amounts for people
+* who were not retired in the previous year.
+* NOTE(review): the header says I5b, but the data file and the exported
+* graph names use "I5a" (and "retired") - confirm which label is intended.
+* File names are left unchanged here.
+
+use "$dir_validation_data/I5a_level_sample", clear
+
+keep if in_sample == 1 
+
+* Obtain predicted log amount 
+gen pred_ln_ypnoab = p 
+
+* Obtain random component (normal draw scaled by sigma)
+cap drop epsilon
+gen epsilon = rnormal()*sigma
+
+* Convert into level with random component 
+gen pred_ypnoab = exp(pred_ln_ypnoab + epsilon) 
+
+* Trim predictions: values below the 1st or above the 99th percentile are
+* set to missing
+sum pred_ypnoab, d
+replace pred_ypnoab = . if pred_ypnoab < r(p1) | pred_ypnoab > r(p99)
+
+* FIX: the subtitle said "Retired in the past year", contradicting the note
+* (sample = not retired in the previous year). Note wording also corrected
+* to "individuals" (plural).
+twoway (hist pred_ypnoab, width(1) color(green)) ///
+	(hist ypnoab_lvl, width(1) color(none) lcolor(black)), ///
+	xtitle (Private Pension Income (GBP)) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Private Pension Income Amount") ///
+	subtitle("Not retired in the past year") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of Private Pension Income received. Estimation sample plotted. Sample contains all" "individuals who were not retired in the previous year. GBP per year, in 2015 prices." "Top and bottom percentiles of predicted trimmed.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I5a_amount_pension_retired_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all
+	
+	
+* By gender 
+* One histogram panel per gender: graph1 is males (dgn == 1), graph2 is
+* females (dgn == 0); the two panels are then combined under one legend.
+
+* Males 
+twoway (hist pred_ypnoab if dgn == 1, width(1) color(green) ///
+	legend(lab(1 "Predicted"))) ///
+(histogram ypnoab_lvl if dgn == 1, width(1) color(none) lcolor(black) ///
+	legend(lab( 2 "Observed"))), ///
+	subtitle("Males") name(graph1, replace) ///
+	xtitle (Private Pension Income (GBP)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) 
+	
+* Females 
+twoway (hist pred_ypnoab if dgn == 0, width(1) color(green) ///
+	legend(lab(1 "Predicted"))) ///
+(histogram ypnoab_lvl if dgn == 0, width(1) color(none) lcolor(black) ///
+	legend(lab( 2 "Observed"))), ///
+	subtitle("Females") name(graph2, replace) ///
+	xtitle (Private Pension Income (GBP)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) 
+	
+* FIX: the subtitle said "Retired in the past year", contradicting the note
+* (sample = not retired in the previous year).
+grc1leg graph1 graph2 ,  ///
+	title("Private Pension Income Amount") ///
+	subtitle("Not retired in the past year") ///
+	legendfrom(graph1) rows(1) ///
+	graphregion(color(white)) ///
+	note("Notes: Predicted vs observed of Private Pension Income received. Estimation sample plotted. Sample contains all" "individuals who were not retired in the previous year. GBP per year, in 2015 prices." "Top and bottom percentiles of predicted trimmed.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I5a_amount_pension_retired_hist_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+
+* By education 	
+* One histogram panel per education level (deh_c3 == 3 low, 2 medium, 1 high,
+* following the panel labels), combined under a shared legend.
+* FIX: the observed histogram is now restricted to the same education group
+* as the predicted one. It was previously filtered on gender (dgn), so each
+* panel compared predictions for an education group with observations for a
+* gender group.
+local edu_lbl3 "Low education"
+local edu_lbl2 "Medium education"
+local edu_lbl1 "High education"
+
+* Panels keep their original names: graph1 = low, graph2 = medium, graph3 = high
+local panel = 0
+foreach edu in 3 2 1 {
+	local ++panel
+	twoway (hist pred_ypnoab if deh_c3 == `edu', width(1) color(green) ///
+		legend(lab(1 "Predicted"))) ///
+	(histogram ypnoab_lvl if deh_c3 == `edu', width(1) color(none) lcolor(black) ///
+		legend(lab( 2 "Observed"))), ///
+		subtitle("`edu_lbl`edu''") name(graph`panel', replace) ///
+		xtitle (Private Pension Income (GBP)) ///
+		xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+		graphregion(color(white)) 
+}
+	
+* FIX: the subtitle said "Retired in the past year", contradicting the note
+* (sample = not retired in the previous year).
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Private Pension Income Amount") ///
+	subtitle("Not retired in the past year") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Predicted vs observed of Private Pension Income received. Estimation sample plotted. Sample contains all" "individuals who were not retired in the previous year. GBP per year, in 2015 prices." "Top and bottom percentiles of predicted trimmed.", size(vsmall))
+
+graph export "$dir_validation_graphs/income/int_validation_I5a_amount_pension_retired_hist_all_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+		
+* Clear all graphs from memory at the end of the income validation
+graph drop _all 	
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_leave_parental_home.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_leave_parental_home.do
new file mode 100644
index 000000000..e1f525c77
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_leave_parental_home.do
@@ -0,0 +1,300 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Leave parental home
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova 
+* LAST UPDATE:		July 2025
+* COUNTRY: 			UK
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					leaving the parental home process. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_leave_parental_home.do"
+********************************************************************************
+
+************************************
+* Process P1a: Leave Parental Home *
+************************************
+
+* Overall 
+use "$dir_validation_data/P1a_sample", clear 
+
+set seed 12345
+gen rnd = runiform() 	
+gen pred_dlftphm = 0 
+replace pred_dlftphm = 1 if inrange(p,rnd,1)
+
+keep if in_sample == 1 
+
+twoway ///
+	(histogram pred_dlftphm, color(red)) ///
+	(histogram dlftphm, color(none) lcolor(black)), ///
+	xtitle (Leave) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Leaving the Parental Home") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating whether individual leaves the parental home given they were observed last year" "living with parents. Estimation sample plotted. Sample contains all 18+.", size(vsmall))
+
+graph export "$dir_validation_graphs/leave_parental_home/int_validation_P1a_leave_parental_home_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 
+	
+	
+* Year 
+use "$dir_validation_data/P1a_sample", clear 
+
+forvalues i = 0/19 { // 20 replications: one simulated outcome variable per random draw
+	local my_seed = 12345 + `i'  
+    set seed `my_seed' 	
+	gen rnd = runiform() 	// fresh uniform(0,1) draw for every observation, reproducible via the seed above
+	gen pred_dlftphm`i' = 0 
+	replace pred_dlftphm`i' = 1 if inrange(p,rnd,1) // event simulated when rnd <= p <= 1; presumably p is the fitted probability saved by the regression do-file (confirm)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) dlftphm pred_dlftphm* [aw = dwt], by(stm)
+
+order pred_dlftphm*
+
+egen pred_dlftphm = rowmean(pred_dlftphm0-pred_dlftphm19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlftphm stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dlftphm stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Leaving the Parental Home") xtitle("Year") ytitle("Share")  ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes:  Estimation sample plotted. Sample contains individuals who are 18+. ", size(vsmall))
+
+graph export "$dir_validation_graphs/leave_parental_home/int_validation_P1a_leave_parental_home_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+restore  
+
+
+* Gender 
+preserve
+
+collapse (mean) dlftphm pred_dlftphm* [aw = dwt], by(dgn stm)
+
+order pred_dlftphm*
+
+egen pred_dlftphm = rowmean(pred_dlftphm0-pred_dlftphm19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlftphm stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dlftphm stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlftphm stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dlftphm stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 ,  ///
+	title("Leaving the Parental Home") ///
+	legendfrom(graph1)  ///
+	graphregion(color(white)) ///
+	note("Notes:  Estimation sample plotted. Sample contains individuals who are 18+.", ///
+	size(vsmall))
+
+graph export "$dir_validation_graphs/leave_parental_home/int_validation_P1a_leave_parental_home_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all  
+
+restore 
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dlftphm pred_dlftphm* [aw = dwt], by(dag)
+
+order pred_dlftphm*
+
+egen pred_dlftphm = rowmean(pred_dlftphm0-pred_dlftphm19)
+
+twoway ///
+(line pred_dlftphm dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dlftphm dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Leaving the Parental Home") subtitl("Share by age") ///
+xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes:  Estimation sample plotted. Sample contains individuals who are 18+.", ///
+	size(vsmall))
+
+graph export "$dir_validation_graphs/leave_parental_home/int_validation_P1a_leave_parental_home_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dlftphm pred_dlftphm* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dlftphm*
+
+egen pred_dlftphm = rowmean(pred_dlftphm0-pred_dlftphm19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlftphm stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dlftphm stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlftphm stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlftphm stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway /// fourth income quintile (ydses_c5 == 4): predicted vs observed (dashed) share by year
+(line pred_dlftphm stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") /// panel title spelling corrected (was "Forth")
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dlftphm stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Leaving the Parental Home") ///
+	subtitle("By hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes:  Estimation sample plotted. Sample contains individuals who are 18+.", ///
+	size(vsmall))
+
+graph export "$dir_validation_graphs/leave_parental_home/int_validation_P1a_leave_parental_home_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Education 
+preserve
+
+collapse (mean) dlftphm pred_dlftphm* [aw = dwt], by(deh_c3 stm)
+
+order pred_dlftphm*
+
+egen pred_dlftphm = rowmean(pred_dlftphm0-pred_dlftphm19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dlftphm stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph1) title("High education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway ///
+(line pred_dlftphm stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway ///
+(line pred_dlftphm stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dlftphm stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+	
+grc1leg graph1 graph2 graph3,  ///
+	title("Leaving the parental home") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes:  Estimation sample plotted. Sample contains individuals who are 18+.", ///
+	size(vsmall))
+
+graph export "$dir_validation_graphs/leave_parental_home/int_validation_P1a_leave_parental_home_ts_all_both_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_partnership.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_partnership.do
new file mode 100644
index 000000000..b93b3c2a3
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_partnership.do
@@ -0,0 +1,703 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Partnership
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova  
+* LAST UPDATE:		July 2025 
+* COUNTRY: 			UK  
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					partnership processes. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_partnership.do"
+********************************************************************************
+
+****************************************************
+* U1a: Partnership formation, in initial edu spell *
+****************************************************
+
+* Overall 
+use "$dir_validation_data/U1a_sample", clear 
+
+set seed 12345
+gen rnd = runiform() 	
+gen pred_dcpen = 0 
+replace pred_dcpen = 1 if inrange(p,rnd,1)
+
+keep if in_sample == 1 
+
+twoway /// U1a overall: distribution of one simulated outcome (pred_dcpen) against the observed dummy (dcpen)
+(histogram pred_dcpen, color(red)) ///
+(histogram dcpen, color(none) lcolor(black)), ///
+	xtitle (Formation) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Partnership Formation") ///
+	subtitle("In initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating forming a partnership. Estimation sample plotted. Sample contains individuals" "who are in their initial education spell and 18-29 years old. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1a_partnership_init_edu_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+
+
+* Year 
+use "$dir_validation_data/U1a_sample", clear 
+
+forvalues i = 0/19 { // 20 replications: one simulated outcome variable per random draw
+	local my_seed = 12345 + `i'  
+    set seed `my_seed' 	
+	gen rnd = runiform() 	// fresh uniform(0,1) draw for every observation, reproducible via the seed above
+	gen pred_dcpen`i' = 0 
+	replace pred_dcpen`i' = 1 if inrange(p,rnd,1) // event simulated when rnd <= p <= 1; presumably p is the fitted probability saved by the regression do-file (confirm)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(stm)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpen stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dcpen stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+	title("Partnership Formation") ///
+	subtitle("In initial education spell") ///
+	xtitle("Year") ytitle("Share")  ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18-29 years old and in their initial education spell. Initial" "education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1a_partnership_init_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+graph drop _all 
+ 
+restore  
+
+
+* Gender 
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(dgn stm)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpen stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpen stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpen stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2,  ///
+	title("Partnership Formation") ///
+	subtitle("In initial education spell") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18-29 years old and in their initial education spell. Initial education spell" "defined generously.", size(vsmall))
+	
+	
+graph export "$dir_validation_graphs/partnership/int_validation_U1a_partnership_init_edu_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+	
+graph drop _all  
+
+restore 
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(dag)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+twoway /// U1a by age: mean of the 20 simulated outcomes vs observed (dashed) share at each age (dag)
+(line pred_dcpen dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dcpen dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Partnership Formation") ///
+	subtitle("In initial education spell, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) /// note below now states 18-29, matching the other U1a figures (was "18+")
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18-29 years old and in their initial education spell. Initial" "education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1a_partnership_init_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 
+
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpen stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway /// fourth income quintile (ydses_c5 == 4): predicted vs observed (dashed) share by year
+(line pred_dcpen stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") /// panel title spelling corrected (was "Forth")
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Partnership Formation") ///
+	subtitle("In initial education spell, by hh disposable income") /// 
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18-29 years old and in their initial education spell. Initial education spell" "defined generously.", size(vsmall))
+	
+graph export "$dir_validation_graphs/partnership/int_validation_U1a_partnership_init_edu_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)	
+
+graph drop _all 	
+	
+restore
+
+
+******************************************************
+* U1b: Partnership formation, left initial edu spell *
+******************************************************
+
+* Overall 
+use "$dir_validation_data/U1b_sample", clear 
+
+set seed 12345
+gen rnd = runiform() 	
+gen pred_dcpen = 0 
+replace pred_dcpen = 1 if inrange(p,rnd,1)
+
+keep if in_sample == 1 
+
+twoway ///
+	(histogram pred_dcpen, color(red)) ///
+	(histogram dcpen, color(none) lcolor(black)), ///
+	xtitle (Formation) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Partnership Formation") ///
+	subtitle("Left initial education spell") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating forming a partnership. Estimation sample plotted. Sample contains individuals" "who are 18+ years old and have left their initial education spell. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1b_partnership_left_edu_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+* Year 
+use "$dir_validation_data/U1b_sample", clear 
+
+forvalues i = 0/19 { // 20 replications: one simulated outcome variable per random draw
+	local my_seed = 12345 + `i'  
+    set seed `my_seed' 	
+	gen rnd = runiform() 	// fresh uniform(0,1) draw for every observation, reproducible via the seed above
+	gen pred_dcpen`i' = 0 
+	replace pred_dcpen`i' = 1 if inrange(p,rnd,1) // event simulated when rnd <= p <= 1; presumably p is the fitted probability saved by the regression do-file (confirm)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(stm)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpen stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dcpen stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+	title("Partnership Formation") ///
+	subtitle("Left initial education spell") ///
+	xtitle("Year") ytitle("Share") ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left initial education spell. Initial" "education spell defined generously.", size(vsmall))
+
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1b_partnership_left_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+ 
+restore  
+
+
+* Gender 
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(dgn stm)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpen stm if dgn == 0, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpen stm if dgn == 0, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("Females") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if dgn == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpen stm if dgn == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph2) title("Males") xtitle("Year") ytitle("Share")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2,  /// combine the female and male panels, taking the shared legend from graph1
+	title("Partnership Formation") ///
+	subtitle("Left initial education spell") ///
+	legendfrom(graph1) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell. Initial" "education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1b_partnership_left_edu_ts_all_gender.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all  
+
+restore 
+ 
+ 
+* Age
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(dag)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+twoway ///
+(line pred_dcpen dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dcpen dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+	title("Partnership Formation") ///
+	subtitle("Left initial education spell, share by age") ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell. Initial" "education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1b_partnership_left_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dcpen pred_dcpen* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dcpen*
+
+egen pred_dcpen = rowmean(pred_dcpen0-pred_dcpen19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpen stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway /// fourth income quintile (ydses_c5 == 4): predicted vs observed (dashed) share by year
+(line pred_dcpen stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") /// panel title spelling corrected (was "Forth")
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpen stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpen stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  /// combine the five quintile panels, taking the shared legend from graph1
+	title("Partnership Formation") ///
+	subtitle("Left initial education spell, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell. Initial" "education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U1b_partnership_left_edu_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
+
+
+**********************************************************
+* U2b: Partnership termination, not in initial edu spell *
+**********************************************************
+
+* Overall 
+use "$dir_validation_data/U2b_sample", clear 
+
+set seed 12345
+gen rnd = runiform() 	
+gen pred_dcpex = 0 
+replace pred_dcpex = 1 if inrange(p,rnd,1)
+
+keep if in_sample == 1 
+
+twoway /// U2b overall: distribution of one simulated outcome (pred_dcpex) against the observed dummy (dcpex)
+	(histogram pred_dcpex, color(red)) ///
+	(histogram dcpex, color(none) lcolor(black)), ///
+	xtitle (Termination) /// axis label corrected: this process is partnership termination (was "Formation")
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Partnership Termination") ///
+	subtitle("Left initial education spell")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating ending a partnership. Estimation sample plotted. Sample contains individuals" "who have left their initial education spell and are 18+ years old. Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U2b_separation_left_edu_hist_all.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+	
+* Year 
+use "$dir_validation_data/U2b_sample", clear 
+
+forvalues i = 0/19 { // 20 replications: one simulated outcome variable per random draw
+	local my_seed = 12345 + `i'  
+    set seed `my_seed' 	
+	gen rnd = runiform() 	// fresh uniform(0,1) draw for every observation, reproducible via the seed above
+	gen pred_dcpex`i' = 0 
+	replace pred_dcpex`i' = 1 if inrange(p,rnd,1) // event simulated when rnd <= p <= 1; presumably p is the fitted probability saved by the regression do-file (confirm)
+	drop rnd
+}
+
+keep if in_sample == 1 
+
+preserve
+
+collapse (mean) dcpex pred_dcpex* [aw = dwt], by(stm)
+
+order pred_dcpex*
+
+egen pred_dcpex = rowmean(pred_dcpex0-pred_dcpex19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpex stm, sort color(green) legend(label(1 "Predicted"))) ///
+(line dcpex stm, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Partnership Termination") ///
+	subtitle("Left initial education spell")  ///
+	xtitle("Year") ytitle("Share")  ///
+	graphregion(color(white)) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell." "Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U2b_separation_left_edu_ts_all_both.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all
+	
+restore  
+
+ 
+* Age
+preserve
+
+collapse (mean) dcpex pred_dcpex* [aw = dwt], by(dag)
+
+order pred_dcpex*
+
+egen pred_dcpex = rowmean(pred_dcpex0-pred_dcpex19)
+
+twoway ///
+(line pred_dcpex dag, sort color(green) legend(label(1 "Predicted"))) ///
+(line dcpex dag, sort color(green) color(green%20) lpattern(dash) ///
+	legend(label(2 "Observed"))), ///
+title("Partnership Termination") ///
+	subtitle("Left initial education spell")  ///
+	xtitle("Age") ///
+	ytitle("Share") xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell." "Initial education spell defined generously.", size(vsmall))
+
+graph export "$dir_validation_graphs/partnership/int_validation_U2b_separation_left_edu_share_age.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+restore
+
+
+* Income 
+preserve
+
+collapse (mean) dcpex pred_dcpex* [aw = dwt], by(ydses_c5 stm)
+
+order pred_dcpex*
+
+egen pred_dcpex = rowmean(pred_dcpex0-pred_dcpex19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpex stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpex stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway ///
+(line pred_dcpex stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpex stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway ///
+(line pred_dcpex stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpex stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway /// fourth income quintile (ydses_c5 == 4): predicted vs observed (dashed) share by year
+(line pred_dcpex stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpex stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("") /// panel title spelling corrected (was "Forth")
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway ///
+(line pred_dcpex stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpex stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3 graph4 graph5,  /// combine the five quintile panels, taking the shared legend from graph1
+title("Partnership Termination") ///
+	subtitle("Left initial education spell by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell." "Initial education spell defined generously.", size(vsmall))
+* NOTE(review): file name below says "init_edu" but the sample and the other U2b exports use "left_edu" - confirm before renaming, documents may reference this file
+graph export "$dir_validation_graphs/partnership/int_validation_U2b_separation_init_edu_ts_all_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
+
+
+* Education
+preserve
+
+collapse (mean) dcpex pred_dcpex* [aw = dwt], by(deh_c3 stm)
+
+order pred_dcpex*
+
+egen pred_dcpex = rowmean(pred_dcpex0-pred_dcpex19)
+
+replace stm = 2000 + stm 
+
+twoway ///
+(line pred_dcpex stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line dcpex stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("High education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpex stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpex stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_dcpex stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line dcpex stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+grc1leg graph1 graph2 graph3,  /// combine the three education panels, taking the shared legend from graph1
+title("Partnership Termination") ///
+	subtitle("Left initial education spell")  ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 18+ years old and have left their initial education spell." "Initial education spell defined generously.", size(vsmall))
+* NOTE(review): file name below says "init_edu" but the sample and the other U2b exports use "left_edu" - confirm before renaming, documents may reference this file
+graph export "$dir_validation_graphs/partnership/int_validation_U2b_separation_init_edu_ts_all_both_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+graph drop _all 	
+	
+restore
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_retirement.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_retirement.do
new file mode 100644
index 000000000..355849eaf
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_retirement.do
@@ -0,0 +1,492 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Retirement
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova 
+* LAST UPDATE:		1 July 2025 
+* COUNTRY: 			UK 
+
+* NOTES: 			Compares predicted values to the observed values of the 
+* 					2 retirement processes estimated. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_retirement.do"
+********************************************************************************
+
+****************************
+* R1a: Retirement - Single *
+****************************
+
+* Overall
+use "$dir_validation_data/R1a_sample", clear
+// One stochastic 0/1 outcome per observation: 1 when rnd <= p <= 1 (rnd is a uniform draw)
+set seed 12345
+gen rnd = runiform() 	
+gen pred_drtren = 0 
+replace pred_drtren = 1 if inrange(p,rnd,1)
+// Only observations flagged in_sample == 1 are plotted
+keep if in_sample == 1 
+
+twoway ///
+	(histogram pred_drtren, color(red)) ///
+	(histogram drtren, color(none) lcolor(black) ), ///
+	xtitle (Retired) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) name(levels, replace) ///
+	title("Retirement") ///
+	subtitle("Non-partnered") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Predicted vs observed of dummy indicating retire. Estimation sample plotted. Sample contains individuals" "who are 50+ years old and do not live with a partner.", size(vsmall))
+	
+graph export "$dir_validation_graphs/retirement/int_validation_R1a_retirement_single_hist_50.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+
+* Year 
+use "$dir_validation_data/R1a_sample", clear
+
+// Build 20 replicate predicted outcomes, pred_drtren0 ... pred_drtren19.
+// Replicate k re-seeds the generator with 12345 + k, draws a fresh uniform
+// number per observation and sets the outcome to 1 when rnd <= p <= 1,
+// 0 otherwise.
+forvalues rep = 0/19 {
+	set seed `=12345 + `rep''
+	gen rnd = runiform()
+	gen pred_drtren`rep' = inrange(p,rnd,1)
+	drop rnd
+}
+
+keep if in_sample == 1
+
+preserve
+
+// weighted share retiring in each year: observed, and one per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(stm)
+
+order pred_drtren*
+
+// pool the replicates by averaging across them
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+replace stm = 2000 + stm
+
+twoway ///
+	(line pred_drtren stm, sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line drtren stm, sort color(green) color(green%20) lpattern(dash) ///
+		legend(label(2 "Observed"))), ///
+	title("Retirement") ///
+	subtitle("Non-partnered") ///
+	xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and do not live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1a_retirement_single_ts_50.png", ///
+	replace as(png) width(2560) height(1440) //quality(100)
+
+graph drop _all
+
+restore
+ 
+ 
+* Age
+preserve
+
+// weighted share retiring at each age: observed, and one per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(dag)
+order pred_drtren*
+
+// pooled prediction: mean across the 20 replicates
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+twoway ///
+	(line pred_drtren dag, sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line drtren dag, sort color(green) color(green%20) lpattern(dash) ///
+		legend(label(2 "Observed"))), ///
+	title("Retirement") subtitle("Non-partnered, share by age") ///
+	xtitle("Age") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and do not live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1a_retirement_single_share_age.png", ///
+	replace as(png) width(2560) height(1440) //quality(100)
+
+restore
+
+
+* Income 
+preserve
+// weighted share retiring per income quintile (ydses_c5) and year: observed and per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(ydses_c5 stm)
+
+order pred_drtren*
+// pooled prediction: mean across replicates pred_drtren0 ... pred_drtren19
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+replace stm = 2000 + stm 
+// one panel per quintile; only graph1 supplies the shared legend used by grc1leg below
+twoway ///
+(line pred_drtren stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line drtren stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+// combine the five panels into one figure with a single legend taken from graph1
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Retirement single") ///
+	subtitle("Non-partnered, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and do not live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1a_retirement_single_ts_50_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+	
+graph drop _all 	
+	
+restore
+
+
+* Education 
+preserve
+// weighted share retiring per education level (deh_c3) and year: observed and per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(deh_c3 stm)
+
+order pred_drtren*
+// pooled prediction: mean across replicates pred_drtren0 ... pred_drtren19
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+replace stm = 2000 + stm 
+// one panel per education level; only graph1 supplies the shared legend used by grc1leg below
+twoway ///
+(line pred_drtren stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line drtren stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("High education") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+
+twoway ///
+(line pred_drtren stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+// combine the three panels into one figure with a single legend taken from graph1
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Retirement") ///
+	subtitle("Non-partnered") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and do not live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1a_retirement_single_ts_50_both_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+
+graph drop _all 	
+	
+restore
+
+
+*******************************
+* R1b: Retirement - Partnered *
+*******************************
+
+* Overall
+use "$dir_validation_data/R1b_sample", clear
+
+// one stochastic 0/1 outcome per observation: 1 when rnd <= p <= 1, else 0
+set seed 12345
+gen rnd = runiform()
+gen pred_drtren = inrange(p,rnd,1)
+
+keep if in_sample == 1
+
+twoway ///
+	(histogram pred_drtren, color(red)) ///
+	(histogram drtren, color(none) lcolor(black)), ///
+	title("Retirement") ///
+	subtitle("Partnered") ///
+	xtitle (Retired) ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(lab(1 "Predicted") lab( 2 "Observed")) ///
+	legend(size(small)) ///
+	name(levels, replace) graphregion(color(white)) ///
+	note("Notes: Predicted vs observed of dummy indicating retire. Estimation sample plotted. Sample contains individuals who are 50+ years old and live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1b_retirement_partnered_hist_50.png", ///
+	replace as(png) width(2560) height(1440) //quality(100)
+	
+
+* Year 
+use "$dir_validation_data/R1b_sample", clear
+
+// 20 replicate outcomes pred_drtren0 ... pred_drtren19: replicate k uses
+// seed 12345 + k and equals 1 when the uniform draw satisfies rnd <= p <= 1
+forvalues rep = 0/19 {
+	set seed `=12345 + `rep''
+	gen rnd = runiform()
+	gen pred_drtren`rep' = inrange(p,rnd,1)
+	drop rnd
+}
+
+keep if in_sample == 1
+
+preserve
+
+collapse (mean) drtren pred_drtren* [aw = dwt], by(stm)
+
+order pred_drtren*
+
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+replace stm = 2000 + stm
+
+twoway ///
+	(line pred_drtren stm, sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line drtren stm, sort color(green) color(green%20) lpattern(dash) ///
+		legend(label(2 "Observed"))), ///
+	title("Retirement") ///
+	subtitle("Partnered") ///
+	xtitle("Year") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1b_retirement_partnered_ts_50.png", ///
+	replace as(png) width(2560) height(1440) //quality(100)
+
+restore
+ 
+ 
+* Age
+preserve
+// weighted share retiring at each age: observed, and one per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(dag)
+order pred_drtren*
+
+// pooled prediction: mean across the 20 replicates
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+twoway ///
+	(line pred_drtren dag, sort color(green) ///
+		legend(label(1 "Predicted"))) ///
+	(line drtren dag, sort color(green) color(green%20) lpattern(dash) ///
+		legend(label(2 "Observed"))), ///
+	title("Retirement") subtitle("Partnered, share by age") ///
+	xtitle("Age") ytitle("Share") ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1b_retirement_partnered_share_age.png", ///
+	replace as(png) width(2560) height(1440) //quality(100)
+
+restore
+
+
+* Income 
+preserve
+// weighted share retiring per income quintile (ydses_c5) and year: observed and per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(ydses_c5 stm)
+
+order pred_drtren*
+// pooled prediction: mean across replicates pred_drtren0 ... pred_drtren19
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+replace stm = 2000 + stm 
+// one panel per quintile; only graph1 supplies the shared legend used by grc1leg below
+twoway ///
+(line pred_drtren stm if ydses_c5 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line drtren stm if ydses_c5 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("First quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Second quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Third quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 4, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 4, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph4) title("Fourth quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if ydses_c5 == 5, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if ydses_c5 == 5, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph5) title("Fifth quintile") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+// combine the five panels into one figure with a single legend taken from graph1
+grc1leg graph1 graph2 graph3 graph4 graph5,  ///
+	title("Retirement") ///
+	subtitle("Partnered, by hh disposable income") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1b_retirement_partnered_ts_50_both_income.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+	
+graph drop _all 	
+	
+restore
+
+
+* Education 
+preserve
+// weighted share retiring per education level (deh_c3) and year: observed and per replicate
+collapse (mean) drtren pred_drtren* [aw = dwt], by(deh_c3 stm)
+
+order pred_drtren*
+// pooled prediction: mean across replicates pred_drtren0 ... pred_drtren19
+egen pred_drtren = rowmean(pred_drtren0-pred_drtren19)
+
+replace stm = 2000 + stm 
+// one panel per education level; only graph1 supplies the shared legend used by grc1leg below
+twoway ///
+(line pred_drtren stm if deh_c3 == 1, sort color(green) ///
+	legend(label(1 "Predicted"))) ///
+(line drtren stm if deh_c3 == 1, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Observed"))), ///
+name(graph1) title("High education") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if deh_c3 == 2, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if deh_c3 == 2, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph2) title("Medium education") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+twoway ///
+(line pred_drtren stm if deh_c3 == 3, sort color(green) ///
+	legend(label(1 "Pred"))) ///
+(line drtren stm if deh_c3 == 3, sort color(green) color(green%20) ///
+	lpattern(dash) legend(label(2 "Obs"))), ///
+name(graph3) title("Low education") xtitle("Year") ytitle("")  ///
+	xlabel(, labsize(small)) ylabel(, labsize(small)) ///
+	legend(size(small)) ///
+	graphregion(color(white))
+
+// combine the three panels into one figure with a single legend taken from graph1
+grc1leg graph1 graph2 graph3 ,  ///
+	title("Retirement") ///
+	subtitle("Partnered") ///
+	legendfrom(graph1) rows(2) ///
+	graphregion(color(white)) ///
+	note("Notes: Estimation sample plotted. Sample contains individuals who are 50+ years old and live with a partner.", size(vsmall))
+
+graph export "$dir_validation_graphs/retirement/int_validation_R1b_retirement_partnered_ts_50_both_edu.png", ///
+	as(png) replace width(2560) height(1440) //quality(100)
+
+
+graph drop _all 	
+	
+restore
+
+
+
diff --git a/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_wages.do b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_wages.do
new file mode 100644
index 000000000..a3c3b916e
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/internal_validation/int_val_wages.do
@@ -0,0 +1,125 @@
+********************************************************************************
+* PROJECT:  		ESPON 
+* SECTION:			Wages
+* OBJECT: 			Internal validation
+* AUTHORS:			Ashley Burdett, Daria Popova 
+* LAST UPDATE:		May 2025
+* COUNTRY: 			UK   
+
+* NOTES: 			Compares predicted values to the observed values. 
+* 					Individual heterogeneity added to the standard predicted 
+* 					values using a random draw like in stochastic 
+* 					imputation. The pooled mean is obtained as in multiple 
+* 					imputation by repeating the random draw 20 times for each 
+* 					process. 
+* 
+* 					Run after "reg_wages.do"
+********************************************************************************
+
+* Female - No previous wage 
+
+use "$dir_validation_data/Female_NPW_sample", clear
+
+* Add a normal draw scaled by e(sigma) to the log prediction, then convert to levels 
+cap drop epsilon
+gen epsilon = rnormal()*e(sigma) 
+* NOTE(review): no estimation is run in this file, so e(sigma) comes from whatever results are in memory - confirm it is this wage model
+replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if in_sample_fnpw 
+ 
+twoway (hist pred_hourly_wage if pred_hourly_wage<150 & in_sample_fnpw == 1, ///
+		width(1) color(red)) ///
+	(hist wage_hour if wage_hour<150 & in_sample_fnpw == 1, width(1) ///
+	color(none) lcolor(black)), ///
+	title("Hourly Wages") ///
+	subtitle("Females, no previous wage observed") ///
+	xtitle (Gross hourly wages (GBP)) legend(lab(1 "Predicted") ///
+	lab( 2 "Observed")) name(log, replace) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Sample includes working age (18-64) females. Predictions obtained from the estimates of  a Heckman model.", size(vsmall))
+	
+graph export ///
+	"$dir_validation_graphs/wages/int_validation_wages_hist_f_npw.png", replace as(png) width(2560) height(1440)
+	
+	
+* Male - No previous wage 
+
+use "$dir_validation_data/Male_NPW_sample", clear
+
+* Add a normal draw scaled by e(sigma) to the log prediction, then convert to levels 
+cap drop epsilon
+gen epsilon = rnormal()*e(sigma) 
+* NOTE(review): no estimation is run in this file, so e(sigma) comes from whatever results are in memory - confirm it is this wage model
+replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if in_sample_mnpw 
+ 
+twoway (hist pred_hourly_wage if  pred_hourly_wage<150 & in_sample_mnpw == 1, ///
+		width(1) color(red)) ///
+	(hist wage_hour if  wage_hour<150 & in_sample_mnpw == 1, width(1) ///
+	color(none) lcolor(black)), ///
+	title("Hourly Wages") ///
+	subtitle("Males, no previous wage observed") ///
+	xtitle (Gross hourly wages (GBP)) legend(lab(1 "Predicted") ///
+	lab( 2 "Observed")) name(log, replace) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Sample includes working age (18-64) males. Predictions obtained from the estimates of  a Heckman model.", size(vsmall))
+	
+graph export ///
+	"$dir_validation_graphs/wages/int_validation_wages_hist_m_npw.png", replace as(png) width(2560) height(1440)
+	
+	
+* Female - Previous wage 
+
+use "$dir_validation_data/Female_PW_sample", clear
+
+* Add a normal draw scaled by e(sigma) to the log prediction, then convert to levels 
+cap drop epsilon
+gen epsilon = rnormal()*e(sigma) 
+* NOTE(review): no estimation is run in this file, so e(sigma) comes from whatever results are in memory - confirm it is this wage model
+replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if in_sample_fpw 
+ 
+twoway (hist pred_hourly_wage if  pred_hourly_wage<150 & in_sample_fpw == 1, ///
+		width(1) color(red)) ///
+	(hist wage_hour if  wage_hour<150 & in_sample_fpw == 1, width(1) ///
+	color(none) lcolor(black)), ///
+	title("Hourly Wages") ///
+	subtitle("Females, previous wage observed") ///
+	xtitle (Gross hourly wages (GBP)) legend(lab(1 "Predicted") ///
+	lab( 2 "Observed")) name(log, replace) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Sample includes working age (18-64) females. Predictions obtained from the estimates of  a Heckman model.", size(vsmall))
+	
+graph export ///
+	"$dir_validation_graphs/wages/int_validation_wages_hist_f_pw.png", replace as(png) width(2560) height(1440)
+	
+	
+* Male - Previous wage 
+
+use "$dir_validation_data/Male_PW_sample", clear
+
+* Add a normal draw scaled by e(sigma) to the log prediction, then convert to levels 
+cap drop epsilon
+gen epsilon = rnormal()*e(sigma) 
+* NOTE(review): no estimation is run in this file, so e(sigma) comes from whatever results are in memory - confirm it is this wage model
+replace pred_hourly_wage = exp(lwage_hour_hat + epsilon) if in_sample_mpw 
+ 
+twoway (hist pred_hourly_wage if  pred_hourly_wage<150 & in_sample_mpw == 1, ///
+		width(1) color(red)) ///
+	(hist wage_hour if  wage_hour<150 & in_sample_mpw == 1, width(1) ///
+	color(none) lcolor(black)), ///
+	title("Hourly Wages") ///
+	subtitle("Males, previous wage observed") ///
+	xtitle (Gross hourly wages (GBP)) legend(lab(1 "Predicted") ///
+	lab( 2 "Observed")) name(log, replace) ///
+	graphregion(color(white)) ///
+	legend(size(small)) ///
+	note("Notes: Sample includes working age (18-64) males. Predictions obtained from the estimates of  a Heckman model.", size(vsmall))
+	
+graph export ///
+	"$dir_validation_graphs/wages/int_validation_wages_hist_m_pw.png", replace as(png) width(2560) height(1440)	
+	
+
+graph drop _all
+
+
diff --git a/input/InitialPopulations/compile/RegressionEstimates/master.do b/input/InitialPopulations/compile/RegressionEstimates/master.do
new file mode 100644
index 000000000..d2a148963
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/master.do
@@ -0,0 +1,138 @@
+
+***************************************************************************************
+* PROJECT:              ESPON: regression estimates for SimPaths using UKHLS data 
+* DO-FILE NAME:         master.do
+* DESCRIPTION:          Main do-file to set the main parameters (country, paths) and call sub-scripts
+***************************************************************************************
+* COUNTRY:              UK
+* DATA:         	    UKHLS EUL version - UKDA-6614-stata [to wave n]
+*
+* AUTHORS: 				Daria Popova, Justin van de Ven
+* LAST UPDATE:          1 july 2025 DP  
+***************************************************************************************
+
+***************************************************************************************
+* General comments:
+* - Note that in the following scripts some standard commands may be 
+*   abbreviated: (gen)erate, (tab)ulate, (sum)marize, (di)splay, 
+*   (cap)ture, (qui)etly, (noi)sily
+
+*Stata packages to install 
+*ssc install fre
+*ssc install tsspell 
+*ssc install carryforward 
+*ssc install outreg2
+*ssc install oparallel
+*ssc install gologit2
+* NOTES: 				Output formatting automated, however if you decide to 
+* 						add or take-away variables from the processes you 
+* 						will need to update the labelling in the excel files. 
+*                        						 
+* 						The income and union parameter do files must be run after
+* 						the wage estimates are obtained because they use 
+* 						predicted wages. The order of the remaining files is
+* 						arbitrary. 
+***************************************************************************************
+***************************************************************************************
+
+clear all
+set more off
+set type double
+set maxvar 30000
+set matsize 1000
+
+
+/**************************************************************************************
+* DEFINE DIRECTORIES
+**************************************************************************************/
+
+* Working directory (machine-specific; the folders below are derived from it)
+global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates"
+
+* Directory which contains do files
+global dir_do "${dir_work}/do"
+
+* Directory which contains log files 
+global dir_log "${dir_work}/log"
+
+* Directory which contains raw output: Excel and Word tables 
+global dir_raw_results "${dir_work}/raw_results"
+
+* Directory which contains final Excel files read by the model  
+global dir_results "${dir_work}/results"
+
+* Directory which contains pooled dataset for estimates (machine-specific)  
+global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data"
+
+* Directory containing external input data 
+global dir_external_data "${dir_work}/external_data"
+
+* Directory containing results of comparison of various weights   
+global weight_checks "${dir_work}/weight_checks"
+
+*********************Internal validation****************************************
+* Directory to save data for internal validation 
+global dir_validation_data "${dir_work}/internal_validation/data"
+
+* Directory for internal validation do-files 
+global dir_do_validation "${dir_work}/internal_validation/do_files"
+
+* Directory for internal validation graphs; the validation do-files export
+* into subfolders of it (e.g. retirement and wages)
+global dir_validation_graphs "${dir_work}/internal_validation/graphs"
+
+* NOTE(review): "countyy" looks like a typo for "country"; the name is kept
+* because other do-files may reference this global - confirm before renaming
+global countyy "UK" 
+
+/*******************************************************************************
+* ESTIMATION FILES
+*******************************************************************************/
+
+do "${dir_do}/reg_education.do"
+
+* reg_wages.do must run before reg_income.do, which uses predicted wages
+do "${dir_do}/reg_leaveParentalHome.do"
+
+do "${dir_do}/reg_partnership.do"
+
+do "${dir_do}/reg_fertility.do"
+
+do "${dir_do}/reg_health.do"
+
+do "${dir_do}/reg_home_ownership.do"
+
+do "${dir_do}/reg_retirement.do"
+
+do "${dir_do}/reg_wages.do"
+
+do "${dir_do}/reg_income.do"
+
+* To skip a step, put a star at the start of its own line. Do not wrap steps in
+* a block comment: block comments nest, so an unmatched opener disables the
+* rest of the file, including the validation steps below.
+
+/*******************************************************************************
+* INTERNAL VALIDATION FILES
+*******************************************************************************/
+
+do "$dir_do_validation/int_val_education.do"	
+
+do "$dir_do_validation/int_val_leave_parental_home.do"	
+
+do "$dir_do_validation/int_val_partnership.do"	
+
+do "$dir_do_validation/int_val_fertility.do"	
+
+do "$dir_do_validation/int_val_health.do"	
+
+do "$dir_do_validation/int_val_home_ownership.do"	
+
+do "$dir_do_validation/int_val_retirement.do"	
+
+do "$dir_do_validation/int_val_wages.do"	
+
+do "$dir_do_validation/int_val_income.do"	
+
+/**************************************************************************************
+* END OF FILE
+**************************************************************************************/
diff --git a/input/InitialPopulations/compile/RegressionEstimates/parametric_matching_process.do b/input/InitialPopulations/compile/RegressionEstimates/parametric_matching_process.do
new file mode 100644
index 000000000..83acf2288
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/parametric_matching_process.do
@@ -0,0 +1,216 @@
+**********************************************************************************************
+*Do file producing estimates for the parametric couple matching process in the Simpaths model
+*Author: Patryk Bronka, Daria Popova 
+*Last edit: Daria Popova 
+*Date: 4 July 2025  
+**********************************************************************************************
+clear all
+set more off
+set mem 200m
+set type double
+//set maxvar 120000
+set maxvar 30000
+// user-written command called further down (extremes); install once if missing
+*ssc install extremes
+
+/*******************************************************************************
+*	DEFINE DIRECTORIES
+*******************************************************************************/
+* Working directory
+global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\union_parametrisation"
+// NOTE: dir_work and dir_ukhls_data are (re)defined here, so this file does not rely on master.do
+* Directory which contains pooled UKHLS dataset 
+global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data"
+
+* Set Excel file 
+// the workbook is opened with replace here; the Parameters sheet is added later with modify
+* Info sheet
+putexcel set "$dir_work/scenario_parametricMatching", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Estimates for the parametric couple matching process"
+putexcel A2 = "Authors:	Patryk Bronka, Daria Popova" 
+putexcel A3 = "Last edit: 4 July 2025 DP"
+
+
+*******************************************************************************************************************************
+*1. Load initial population data 
+*import delimited $gitFolder\population_UK_initial.csv, clear
+use "$dir_ukhls_data/ukhls_pooled_all_obs_10.dta", clear //note this is a pooled dataset after Heckman has been estimated  
+// declare the person-by-stm panel so the lag operator can be used below
+sort idperson stm  
+xtset idperson stm 
+gen newMarriage = (idpartner > 0 & idpartner<.) & (l.idpartner<= 0 | l.idpartner>=.)
+*Note: individuals whose dcpyy (number of years in a partnership) equals 1, are newly married
+// NOTE(review): the lag is also missing when a person has no row at stm-1, so a partnered person's first row (or first after a gap) is flagged - confirm intended
+save "$dir_work/parametricUnionDataset", replace 
+// the file saved above contains newMarriage but not predictedWage, and no rows have been dropped yet
+
+*2. Use wages predicted using wage equation:
+sum pred_hourly_wage if dgn == 0
+sum pred_hourly_wage if dgn == 1
+
+gen predictedWage=pred_hourly_wage
+
+*3. Keep only those above 18 as that's the minimum age to get married in the simulation
+keep if dag >= 18
+
+*4. Look at newly matched couples in the initial population (this requires the longitudinal component). 
+*This has been added to the input data file as newMarriage variable
+tempfile partners
+preserve
+keep if dgn == 0 //All partners female
+keep stm idperson idhh dgn dag predictedWage
+rename idperson idpartner
+rename dag dagPartner
+rename predictedWage predictedWagePartner
+rename dgn dgnPartner
+save `partners', replace
+restore 
+
+//Keep only newly matched people
+drop if idpartner < 0 | missing(idpartner) 
+keep if newMarriage
+keep if dgn == 1
+// attach the partner's age and wage: the using file is keyed by her id renamed to idpartner; only matched rows are kept
+merge 1:1 stm idpartner using `partners', keep(matched)
+
+
+*4. Look at the difference in wage and age of the newly matched couples
+*The first partner should probably always have the same gender, so calculate the difference between male - female
+gen dagDifference = dag - dagPartner
+// rows kept at this point are dgn == 1; dagPartner comes from the dgn == 0 partner file
+/*check for outliers in wages*/
+sum predictedWage , d
+sum predictedWage [weight=disclwt], d
+// list the 20 highest predicted wages (extremes is a user-written command, see the ssc install line at the top)
+extremes predictedWage, n(20) freq high
+/*
+freq:	predict~e	
+		
+1	193.11859	
+1	198.58132	
+1	198.68025	
+1	204.45788	
+1	231.91774	
+		
+1	236.29345	
+1	240.13629	
+1	246.56729	
+1	307.37445	
+1	309.38673	
+		
+1	335.10219	
+1	346.1395	
+1	371.56325	
+1	426.89188	
+1	427.71505	
+		
+1	452.59122	
+1	513.48099	
+1	516.99593	
+1	696.44839	
+1	982.29694	
+*/
+
+extremes predictedWagePartner, n(20) freq high
+/*
+freq:	predict~r	
+		
+1	148.58268	
+1	151.67459	
+2	153.68723	
+1	154.69337	
+1	186.93118	
+		
+1	191.78429	
+1	212.0091	
+1	221.97558	
+1	222.76736	
+1	274.04278	
+		
+1	277.90405	
+1	288.26281	
+1	301.90966	
+1	305.08388	
+1	330.18868	
+		
+1	426.80633	
+1	478.99185	
+1	482.67028	
+1	641.02564	
+1	952.05343	
+*/
+
+*Winsorise outliers: values outside the 1st-99th percentile range are set to the nearest bound (no rows dropped) 
+foreach var in predictedWage predictedWagePartner {
+centile `var', centile(1 99)
+scalar p1 = r(c_1)
+scalar p99 = r(c_2)
+replace `var' = scalar(p1) if `var' < scalar(p1)
+replace `var' = scalar(p99) if `var' > scalar(p99) & !missing(`var')
+}
+// scalar() is used above so that a variable named or abbreviated p1 or p99 can never be picked up instead of the scalar
+gen predictedWageDifference = predictedWage - predictedWagePartner 
+drop if missing(dagDifference) | missing(predictedWageDifference)
+sum predictedWageDifference, d
+//sum predictedWageDifference [weight=disclwt], d
+
+
+*5. Plot the distribution of wage and age differentials against a normal distribution
+hist dagDifference, frequency normal
+
+hist predictedWage, frequency normal
+hist predictedWagePartner, frequency normal
+hist predictedWageDifference, frequency normal
+
+
+ 
+*6. Obtain the parameters for the bivariate normal distribution 
+*Sample moments are a good enough approximation to the true parameters?
+sum dagDifference predictedWageDifference //Get sample mean and std dev
+// write labels in column A and values in column B of the Parameters sheet of the same workbook
+putexcel set "$dir_work/scenario_parametricMatching", sheet("Parameters") modify 
+putexcel A1=("Parameter") 
+putexcel A2=("mean_dag_diff")
+putexcel A3=("mean_wage_diff")
+putexcel A4=("var_dag_diff")
+putexcel A5=("var_wage_diff")
+putexcel A6=("cov_dag_wage_diff")
+putexcel B1=("Value")
+// mean and variance of the age difference
+qui sum dagDifference 
+putexcel B2=matrix(r(mean)')
+putexcel B4=matrix(r(Var)')
+// mean and variance of the wage difference
+qui sum predictedWageDifference
+putexcel B3=matrix(r(mean)')
+putexcel B5=matrix(r(Var)')
+
+corr dagDifference predictedWageDifference, cov 
+return list
+matrix list r(C) //Get variance-covariance matrix
+// covariance between the two differences, read from the r() results of the corr call above
+putexcel B6=matrix(r(cov_12)') 
+
+
+*rho x,y = cov x,y / (sigma x * sigma y), which is equivalent to corr dagDifference predictedWageDifference
+corr dagDifference predictedWageDifference
+
+/* 
+Mean dagDifference = -2.19378
+Sigma dagDifference = 5.472693 // Variance is 29.950369, Bessel corrected variance is 30.02219242685851, so corrected sigma is 5.47925108266253
+Mean predictedWageDifference = -6.563083
+Sigma predictedWageDifference = 4.282041 //Variance is 18.335874, Bessel corrected variance is 18.37984492086331, so corrected sigma is 4.287172135669771
+
+rho = cov(x,y) / (sigma(x)*sigma(y)) = 6.1343291 / (5.472693*4.282041) = 0.261767... ~ 0.2618 which is equivalent to correlation of dagDifference and predictedWageDifference
+
+Bessel's correction to get the unbiased estimator:
+*/
+// NOTE(review): summarize and correlate are documented to use the n-1 denominator already, and rho is a ratio in which that factor cancels - confirm this rescaling is wanted
+scalar BesselCorrection = _N / (_N - 1)
+di BesselCorrection
+// the rescaled rho below is only displayed; the Parameters sheet holds the sample moments written above
+*Corrected rho:
+qui corr dagDifference predictedWageDifference
+di "Small sample corrected rho:"
+di r(rho) * BesselCorrection
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_education.do b/input/InitialPopulations/compile/RegressionEstimates/reg_education.do
index f5485e6e2..1b25447c9 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_education.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_education.do
@@ -1,10 +1,15 @@
 ********************************************************************************
 * PROJECT:  		ESPON 
 * SECTION:			Education
-* OBJECT: 			Final Probit Models - Weighted
-* AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
-********************************************************************************
+* OBJECT: 			Final Probit & Generalised Logit Models - Weighted
+* AUTHORS:			Patryk Bronka, Daria Popova, Justin van de Ven
+* LAST UPDATE:		1 July 2025 DP  
+* COUNTRY: 			UK  
+* 
+* NOTES: 	                   
+*                    
+********************************************************************************		
+
 clear all
 set more off
 set mem 200m
@@ -12,174 +17,899 @@ set type double
 //set maxvar 120000
 set maxvar 30000
 
+*******************************************************************
+cap log close 
+log using "${dir_log}/reg_education.log", replace
+*******************************************************************
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-* Directory which contains do files
-global dir_do "${dir_work}/do"
+do "$dir_do/variable_update"
 
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
 
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
 
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
+* Sample selection 
+drop if dag < 16
 
 
-*******************************************************************
-cap log close 
-log using "${dir_log}/reg_education.log", replace
-*******************************************************************
+xtset idperson swv
 
-use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
+* Set Excel file 
 
+* Info sheet
+putexcel set "$dir_results/reg_education", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters governing projection of education status"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
 
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
 
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-			
-label define gdr 1  "Male" 0 "Female"
-			
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-
-label define yn	1 "Yes" 0 "No"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children" 
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 3 Category" 
-label variable deh_c3 "Educational Attainment: 3 Category"
-/*
-label variable dehm_c3 "Mother's Educational Attainment: 3 Category"
-label variable dehf_c3 "Father's Educational Attainment: 3 Category"
-*/
-label variable dehmf_c3 "Highest Parental Educational Attainment: 3 Category"
-label variable dhhtp_c4 "Household Type: 4 Category"
-label variable dnc "Number of Children in Household"
-label variable dnc02 "Number of Children aged 0-2 in Household"
-
-label value dgn gdr
-label value drgn1 rgna
-label value les_c3 jbf 
-label value deh_c3 dehmf_c3  /*dehm_c3 dehf_c3*/ edd 
-label value ded yn
-label value dhhtp_c4 hht
+putexcel A5 = "E1a"
+putexcel B5 = "Probit regression estimates of remaining in continuous education - individuals aged 16-29 in initial education spell"
 
-drop if dag < 16
+putexcel A6 = "E1b"
+putexcel B6 = "Probit regression estimates of returning to education - individuals aged 16-35 not in initial education spell"
 
-replace stm = stm - 2000
-fre stm 
+putexcel A7 = "E2a"
+putexcel B7 = "Generalized ordered logit regression estimates of education attainment - individuals aged 16-29 exiting education that were in initial education spell in t-1 but not in t"
+putexcel B8 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table"
+putexcel B9 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name."
 
-/*check if all covariates are available in the data*/ 
-recode ded dgn dag dagsq dehmf_c3 drgn1 stm deh_c3 les_c3 (-9=.) 
+putexcel A10 = "Notes:", bold
+putexcel B10 = "Added:  ethnicity-4 cat (dot); covid dummies (y2020 y2021)"
 
 
-xtset idperson swv
+putexcel set "$dir_results/reg_education", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold	
+
 
+************************************************************
+* E1a: Probability of Remaining in Initial Education Spell *
+************************************************************
+* Process E1a: Remaining in the initial education spell. 
+* Sample: Individuals aged 16-29 who have not left their initial education spell
+* DV: In continuous education dummy 
+* Note: Condition implies some persistence - education for the last 2 years. 
 
-**********************************
-*Probability of Being a Student  *
-**********************************
-*Process E1a: Probability of being in education. Sample: Individuals aged 16-29 in continuous education.
-*or probability of remaining in education for those who have always been in education without interruptions.
+fre ded if (dag >= 16 & dag <= 29 & l.ded == 1) 
+// was in initial education spell in the previous wave 
+// 70.1% remain in education 
 
-*sample: Individuals aged 16-29 in continuous education.	
-fre ded if (dag>=16 & dag<=29 & l.ded==1) /*was in continious education in the previous wave  */
+/*//////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot ///
+   if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_E1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(E1a, dimlwt) side dec(4) 
 
-probit ded i.dgn dag dagsq ib1.dehmf_c3 /*ib1.dehm_c3 ib1.dehf_c3*/ ib8.drgn1 stm if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimxwt], vce(robust)
+probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot ///
+   if (dag>=16 & dag<=29 & l.ded==1) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_E1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1a, disclwt) side dec(4) 
+
+probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot ///
+   if (dag>=16 & dag<=29 & l.ded==1)  [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_E1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1a, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_E1a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot ///
+   if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimxwt], vce(robust)
+
+   * save raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/education", sheet("Process E1a") replace
+putexcel set "$dir_raw_results/education/education", sheet("Process E1a") replace
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/E1a.doc", replace ///
-title("Process E1a: Probability of remaining in continuous education - individuals aged 16-29 in continuous education.") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/education/E1a.doc", replace ///
+title("Process E1a: Probability of remaining in initial education spell - individuals aged 16-29 in initial education spell.") ///
  ctitle(Continuing student) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/E1a_sample", replace
+
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
 
-****************************************
-*Probability of Returning to education *
-****************************************
-*Process E1b: Probability of being in education. Sample: Individuals aged 16-35 not in continuous education.
-*Or probability of returning to education for those who had left school.
 
-*sample: Individuals aged 16-35 not in continuous education. 
-fre der if (dag>=16 & dag<=35 & ded==0) 
+* Results 
 
-probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3 /*ib1.dehm_c3 ib1.dehf_c3*/ ib8.drgn1 stm if (dag>=16 & dag<=35 & ded==0)  [pweight=dimlwt], vce(robust)
+* Note: Zero-valued coefficients (omitted baseline categories) are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+* Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/education/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/education/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_education", sheet("E1a") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_education", sheet("E1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+
+
+* Labelling 
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Dehmf_c3_Medium"
+putexcel A6 = "Dehmf_c3_Low"
+putexcel A7 = "UKC"
+putexcel A8 = "UKD"
+putexcel A9 = "UKE"
+putexcel A10 = "UKF"
+putexcel A11 = "UKG"
+putexcel A12 = "UKH"
+putexcel A13 = "UKJ"
+putexcel A14 = "UKK"
+putexcel A15 = "UKL"
+putexcel A16 = "UKM"
+putexcel A17 = "UKN"
+putexcel A18 = "Year_transformed"
+putexcel A19 = "Y2020"
+putexcel A20 = "Y2021"
+putexcel A21 = "Ethn_Asian"
+putexcel A22 = "Ethn_Black"
+putexcel A23 = "Ethn_Other"
+putexcel A24 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Dehmf_c3_Medium"
+putexcel G1 = "Dehmf_c3_Low"
+putexcel H1 = "UKC"
+putexcel I1 = "UKD"
+putexcel J1 = "UKE"
+putexcel K1 = "UKF"
+putexcel L1 = "UKG" 
+putexcel M1 = "UKH" 
+putexcel N1 = "UKJ" 
+putexcel O1 = "UKK" 
+putexcel P1 = "UKL" 
+putexcel Q1 = "UKM" 
+putexcel R1 = "UKN" 
+putexcel S1 = "Year_transformed" 
+putexcel T1 = "Y2020" 
+putexcel U1 = "Y2021"
+putexcel V1 = "Ethn_Asian" 
+putexcel W1 = "Ethn_Black"
+putexcel X1 = "Ethn_Other"
+putexcel Y1 = "Constant"
+
+	
+* Goodness of fit
+
+putexcel set "$dir_results/reg_education", sheet("Gof") modify
+
+putexcel A3 = "E1a - Remaining in initial education spell", bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
+
+**********************************************
+* E1b: Probability of Returning to Education *
+**********************************************
+
+* Process E1b: Retraining having previously entered the labour force. 
+* Sample: Individuals aged 16-35 who have left their initial education spell 
+*  			and not a student last year 
+* DV: Return to education 
+
+fre der if (dag >= 16 & dag <= 35 & ded == 0) 
+// 69.3% remain out of education 
+
+/*//////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3  ib8.drgn1 stm y2020 y2021 i.dot ///
+if (dag >= 16 & dag <= 35 & ded==0  & l.der==0)  [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_E1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(E1b, dimlwt) side dec(4)
+
+probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3  ib8.drgn1 stm y2020 y2021 i.dot ///
+if (dag >= 16 & dag <= 35 & ded==0  & l.der==0)	 [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_E1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1b, disclwt) side dec(4) 
+
+probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3  ib8.drgn1 stm y2020 y2021 i.dot ///
+if (dag >= 16 & dag <= 35 & ded==0  & l.der==0)  [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_E1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1b, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_E1b.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3  ib8.drgn1 stm y2020 y2021 i.dot ///
+if (dag >= 16 & dag <= 35 & ded==0  & l.der==0) ///
+	 [pweight=dimxwt], vce(robust)
+
+	* save raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/education", sheet("Process E1b") modify
+putexcel set "$dir_raw_results/education/education", sheet("Process E1b") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/E1b.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/education/E1b.doc", replace ///
 title("Process E1b: Probability of returning to education - individuals aged 16-35 not in continuous education.") ///
  ctitle(Returning student) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
+gen in_sample = e(sample)	
 
-********************************************
-*Educational level after leaving education *
-********************************************
-*Process E2: Educational attainment. Sample: Respondents from Process 1a who have left education.
-*Or Level of education for those leaving education.
+predict p
 
-*sample: Individuals aged 16-29 who were in continuous education and left it. 
-fre deh_c3 if (dag>=16 & dag<=29) & l.ded==1 & ded==0
+save "$dir_validation_data/E1b_sample", replace
 
-/*
-mprobit deh_c3 i.dgn dag dagsq ib1.dehm_c3 ib1.dehf_c3 ib8.drgn1 stm if sedcsmpl==1 [pweight=dimxwt], vce(robust)
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set "$dir_data/education.xlsx", sheet("Process E2 - Education Level") modify
-putexcel A1 = matrix(results), names nformat(number_d2) 
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	 
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+	 
+	
+* Results
+* Note: Zero-valued coefficients (omitted baseline categories) are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
 
-mprobit deh_c3 i.dgn dag dagsq ib1.dehm_c3 ib1.dehf_c3 ib8.drgn1 stm if sedcsmpl==1 [pweight=dimxwt], vce(robust)
-matrix e2=get(VCE)
-matrix list e2
-putexcel set "$dir_data/edu_vcm.xlsx", sheet("Process E2 - Education Level") modify
-putexcel A1 = matrix(e2), names 
 
-//capture log close
-*/
+*  Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/education/var_cov", sheet("var_cov") ///
+	replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/education/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_education", sheet("E1b") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_education", sheet("E1b") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+		
+		
+* Labelling 
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Deh_c3_Medium_L1"
+putexcel A6 = "Deh_c3_Low_L1"
+putexcel A7 = "Les_c3_NotEmployed_L1"
+putexcel A8 = "Dnc_L1"
+putexcel A9 = "Dnc02_L1"
+putexcel A10 = "Dehmf_c3_Medium"
+putexcel A11 = "Dehmf_c3_Low"
+putexcel A12 = "UKC"
+putexcel A13 = "UKD"
+putexcel A14 = "UKE"
+putexcel A15 = "UKF"
+putexcel A16 = "UKG"
+putexcel A17 = "UKH"
+putexcel A18 = "UKJ"
+putexcel A19 = "UKK"
+putexcel A20 = "UKL"
+putexcel A21 = "UKM"
+putexcel A22 = "UKN"
+putexcel A23 = "Year_transformed"
+putexcel A24 = "Y2020"
+putexcel A25 = "Y2021"
+putexcel A26 = "Ethn_Asian"
+putexcel A27 = "Ethn_Black"
+putexcel A28 = "Ethn_Other"
+putexcel A29 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Deh_c3_Medium_L1"
+putexcel G1 = "Deh_c3_Low_L1"
+putexcel H1 = "Les_c3_NotEmployed_L1"
+putexcel I1 = "Dnc_L1"
+putexcel J1 = "Dnc02_L1"
+putexcel K1 = "Dehmf_c3_Medium"
+putexcel L1 = "Dehmf_c3_Low"
+putexcel M1 = "UKC"
+putexcel N1 = "UKD"
+putexcel O1 = "UKE"
+putexcel P1 = "UKF"
+putexcel Q1 = "UKG"
+putexcel R1 = "UKH"
+putexcel S1 = "UKJ"
+putexcel T1 = "UKK"
+putexcel U1 = "UKL"
+putexcel V1 = "UKM"
+putexcel W1 = "UKN"
+putexcel X1 = "Year_transformed"
+putexcel Y1 = "Y2020"
+putexcel Z1 = "Y2021"
+putexcel AA1 = "Ethn_Asian"
+putexcel AB1 = "Ethn_Black"
+putexcel AC1 = "Ethn_Other"
+putexcel AD1 = "Constant"
+
+* Goodness of fit
+
+putexcel set "$dir_results/reg_education", sheet("Gof") modify
+
+putexcel A8 = "E1b - Returning to education", bold		
+
+putexcel A10 = "Pseudo R-squared" 
+putexcel B10 = r2_p 
+putexcel A11 = "N"
+putexcel B11 = N 
+putexcel E10 = "Chi^2"		
+putexcel F10 = chi2
+putexcel E11 = "Log likelihood"		
+putexcel F11 = ll
+		
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
+
+*************************************************
+* E2a Educational Level After Leaving Education *
+*************************************************
+
+* Process E2a: Educational level achieved when leaving the initial spell of 
+* 				education  
+* Sample: Those 16-29 who have left their initial education spell in current 
+* 			year 
+* DV: Education level (3 cat)  
+* Note: Previously tried a multinomial probit, now use a generalised ordered logit 
+
+fre deh_c3 if (dag >= 16 & dag <= 29) & l.ded == 1 & ded == 0
+
+recode deh_c3 (1 = 3) (3 = 1), gen(deh_c3_recoded)	
+lab def deh_c3_recoded 1 "Low" 2 "Medium" 3 "High"
+lab val deh_c3_recoded deh_c3_recoded
+
+
+/* Model specification tests 
+
+local model_specification_test=0 
+
+if `model_specification_test' == 0 {
+
+	* Option 1 - Ordered logit  
+
+	* Testing the parallel lines assumption 
+	* 	- the model asssumes that coefs (apart for the constant) when estimating  
+	* 		a series of binary probits for 1 vs higher, 1&2 vs higher, 1&2&3 vs 
+	* 		higher
+	*	- Brant test null: the slope coefficients are the same across response  
+	* 		all categories (p<0.05 -> violating the prop odds assumption)
+
+	sort idperson swv
+
+
+	ologit deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+		dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 ///
+		[pweight = dimxwt], vce(robust)
+	 
+	oparallel, ic /*note: all tests have very high Chi2 statistics with p-values of 0.000.the parallel lines assumption is violated.*/
+ 
+ 
+	* Option 2 - Linear model 
+
+	xtset idperson swv
+
+	reg deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+		dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust)
+
+
+	// obtain distribution of predicted values plot 
+	// make sure to add in sampling variance
+	gen in_sample = e(sample)
+
+	scalar sigma = e(rmse)
+	gen epsilon = rnormal()*sigma
+	sum epsilon 
+	predict pred_edu if in_sample == 1
+	replace pred_edu = pred_edu + epsilon if in_sample == 1
+
+	twoway (hist deh_c3_recoded if in_sample == 1 , lcolor(gs12) ///
+		fcolor(gs12)) (hist pred_edu if in_sample == 1 , ///
+		fcolor(none) lcolor(red)), xtitle (Education level) ///
+		legend(lab(1 "Observed") lab( 2 "Predicted")) name(levels, replace) ///
+		graphregion(color(white))
 
+	drop in_sample pred_edu epsilon
 
-/*******************************************************************************
-* Ordered probit model to replace multinomial probit E2a
-*******************************************************************************/
+	sort idperson swv
+ 
+ 
+	* Option 3 - Generalized ordered logit  
+	
+	gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+		dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust) autofit 
+	// does the	model produce any negative probabilities? 
+	// if so, 
+	//	1 - play around with the controls 
+	//  2 - consider in the simulation converting the negative probabilities 
+	//		to be zero and rescaling the cdf to sum to 1
+	 
+}
+*/
 
-*1. Recode education level (outcome variable) so 1 = Low education, 2 = Medium education, 3 = High education
-recode deh_c3 ///
-	(1 = 3) ///
-	(3 = 1) ///
-	, gen(deh_c3_recoded)
+* Generalized ordered logit 
+sort idperson swv
+/*
+//////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+	dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight=dimlwt], vce(robust) autofit 
+outreg2 using "${weight_checks}/weight_comparison_E2a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(E2a, dimlwt) side dec(4) 
+
+gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+	dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = disclwt], vce(robust) autofit 
+outreg2 using "${weight_checks}/weight_comparison_E2a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E2a, disclwt) side dec(4) 
+
+gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+	dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust) autofit 
+outreg2 using "${weight_checks}/weight_comparison_E2a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E2a, dimxwt) side dec(4)  
+erase "${weight_checks}/weight_comparison_E2a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+gologit2 deh_c3_recoded i.Dgn Dag Dag_sq ///
+         i.Dehmf_c3_Medium i.Dehmf_c3_Low ///
+         i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN ///
+         Year_transformed Y2020 Y2021 ///
+         i.Ethn_Asian i.Ethn_Black i.Ethn_Other ///
+if dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust) autofit 
 	
-la def deh_c3_recoded 1 "Low" 2 "Medium" 3 "High"
-la val deh_c3_recoded deh_c3_recoded
+*Note: In gologit2, the coefficients show how covariates affect the log-odds of being above a certain category vs. at or below it.
 
-//oprobit deh_c3_recoded i.dgn dag dagsq ib1.dehm_c3 ib1.dehf_c3 ib8.drgn1 stm if (dag>=16 & ded == 0) [pweight=dimxwt], vce(robust)
-oprobit deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 /*ib1.dehm_c3 ib1.dehf_c3*/ ib8.drgn1 stm if (dag>=16 & dag<=29 & l.ded==1 & ded==0) [pweight=dimxwt], vce(robust)
+	
+ * raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/education", sheet("Process E2a") modify
+putexcel set "$dir_raw_results/education/education", sheet("Process E2a") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/E2a.doc", replace ///
-title("Process E2a: Ordered probit for educational attainment - individuals aged 16-29 exiting education.") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/education/E2a.doc", replace ///
+title("Process E2a: Generalized ordered logit for educational attainment - individuals aged 16-29 who have left initial education spell.") ///
  ctitle(Education attainment) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
+* Save sample inclusion indicator and predicted probabilities		
+gen in_sample = e(sample)
+predict p1 p2 p3 
+	
+* Save sample for later use (internal validation)	
+save "$dir_validation_data/E2a_sample", replace
+
+* Store model summary statistics
+scalar r2_p = e(r2_p) 
+scalar N_sample = e(N)	 
+	
+* Store results in Excel 
+
+* Store estimates in matrices
+matrix b = e(b)	
+matrix V = e(V)
+
+* Raw output 
+putexcel set "$dir_results/reg_education", sheet("E2a_raw") modify
+putexcel A1 = matrix(b'), names //nformat(number_d2) 
+putexcel A1 =  "CATEGORY"
+putexcel B1 =  "REGRESSOR"
+putexcel C1 =  "COEFFICIENT"
+
+* Estimated coefficients 
+scalar no_coefs_all = colsof(b)
+
+* Eliminate rows and columns containing zeros (baseline cats) 
+mata:
+	// Call matrices into mata 
+    b = st_matrix("b")
+
+    // Find which coefficients are nonzero
+    keep = (b :!= 0)
+	
+    // Eliminate zeros
+	nonzero_b = select(b, keep)
+
+	// Inspect
+	nonzero_b 
+	
+    // Return to Stata
+    st_matrix("nonzero_b", nonzero_b)
+	st_matrix("nonzero_b_flag", keep)
+end	
+
+* Inspect
+matrix list b 
+matrix list nonzero_b
+matrix list nonzero_b_flag
+
+* Save dimensions: total non-zero coefficients and the number per gologit2 equation
+scalar no_nonzero_b = colsof(nonzero_b)
+scalar no_nonzero_b_per = no_nonzero_b / 2 // number of categories-1: deh_c3_recoded has 3 levels (Low/Medium/High), so 2 equations
+
+* Address repetition of proportional odds covariates
+
+* Generate repetition/unique observation flag
+mata:
+	// Import matrices into mata
+	nonzero_b_mata = st_matrix("nonzero_b")
+	
+	// Generate binary vector =1 if coefficient repeated 
+	n = cols(nonzero_b_mata)
+	repetition_flag = J(n, 1, 0)
+
+	// use tolerance based comparison to avoid precision errors 
+	tol = 1e-8
+
+		for (i = 1; i <= n; i++) {
+			for (j = 1; j <= n; j++) {
+				if (i != j && abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) {
+					repetition_flag[i] = 1
+					break
+				}
+			}
+	}
+	repetition_flag
+
+	// Generate binary vector =1 if coefficient not repeated 
+	unique_flag  = 1 :- repetition_flag
+
+	// Return to Stata
+	st_matrix("repetition_flag", repetition_flag')
+	st_matrix("unique_flag", unique_flag')
+
+end
+
+* Generate vector to multiply the coef vector with to eliminate the 
+* repetitions of coefficients for vars that satisfy the proportional odds assumptions
+matrix structure_a = J(1,no_nonzero_b_per,1)
+matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b]
+matrix structure = structure_a, structure_b
+
+* Inspect
+matrix list structure_a
+matrix list structure_b
+matrix list structure
+matrix list nonzero_b
+
+* Eliminate repetitions 
+mata:
+	// Call matrices into mata 
+	var = st_matrix("var")
+	structure = st_matrix("structure")
+	nonzero_b = st_matrix("nonzero_b")
+	
+	// Convert repetitions into zeros 
+	b_structure = structure :* nonzero_b
+
+	b_structure 
+	
+	// Eliminate zeros 
+	keep = (b_structure :!= 0)
+	
+	nonzero_b_structure = select(b_structure, keep)
+	
+	// Export to Stata
+	st_matrix("b_structure", b_structure)
+	st_matrix("nonzero_b_structure", nonzero_b_structure)
+
+end
+
+matrix list nonzero_b_structure
+
+* Export into Excel 
+putexcel set "$dir_results/reg_education", sheet("E2a") modify
+putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) 
+
+
+
+* Variance-covariance matrix 
+* Eliminate zeros (baseline categories)
+mata:
+    V = st_matrix("V")
+    b = st_matrix("b")
+
+    // Find which coefficients are nonzero
+    keep = (b :!= 0)
+	
+	// Eliminate zeros 
+    V_trimmed = select(V, keep)
+    V_trimmed = select(V_trimmed', keep)'
+
+	V_trimmed 
+	
+    // Return to Stata
+    st_matrix("var", V_trimmed)
+end			
+
+matrix list var
+
+* Address repetition due to proportional odds being satisfied for some covars
+matrix square_structure_a = J(no_nonzero_b,1,1) * structure
+matrix square_structure_b = square_structure_a'
+
+matrix list square_structure_a
+matrix list square_structure_b
+mata:
+	// Call matrices into mata 
+	var = st_matrix("var")
+	
+	// Create structure matrix (0 = eliminate)
+	square_structure_a = st_matrix("square_structure_a")
+	square_structure_b = st_matrix("square_structure_b")
+	
+	// Element-by-element multiplication
+	square_structure = square_structure_a :* square_structure_b 
+	var_structure = square_structure :* var
+	
+	// Eliminate zeros 
+	row_keep = rowsum(abs(var_structure)) :!= 0
+	col_keep = colsum(abs(var_structure)) :!= 0
+
+	nonzero_var_structure = select(select(var_structure, row_keep), col_keep)
+
+	// Return to Stata
+	st_matrix("nonzero_var_structure", nonzero_var_structure)
+end
+
+matrix list nonzero_var_structure
+
+* Export to Excel 
+putexcel set "$dir_results/reg_education", sheet("E2a") modify
+putexcel C2 = matrix(nonzero_var_structure)
+		
+			
+* Labels
+putexcel set "$dir_results/reg_education", sheet("E2a") modify
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+/* Create temporary frame ==> not available in stata 14
+frame create temp_frame
+frame temp_frame: {
+    
+    mata: 
+		// Import matrices from Stata
+		nonzero_b_flag = st_matrix("nonzero_b_flag")'
+		unique_flag = st_matrix("unique_flag")'
+		structure = st_matrix("structure")'
+		stripe = st_matrixcolstripe("e(b)")
+		
+		// Extract variable and category names
+		catnames = stripe[.,1]
+		varnames = stripe[.,2]
+		varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+		catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+		
+		// Create and clean labels 
+		// Address lags
+		labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+		
+		// Add category 
+		labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+		
+		// Remove 1. 
+		labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+		
+		// Constant 
+		labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+					
+		nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+		
+		// Add v1
+		nonzero_labels_structure = "v1"\nonzero_labels_structure
+		
+		// Create temp file with results
+		fh = fopen("$dir_results/temp_labels.txt", "w")
+		for (i=1; i<=rows(nonzero_labels_structure); i++) {
+			fput(fh, nonzero_labels_structure[i])
+		}
+		fclose(fh)
+    end
+ */
+ * Here's a replacement for stata 14: 
+local dir_results "$dir_results"  
+
+preserve
+* Run Mata block
+mata: 
+    // Import matrices from Stata
+    nonzero_b_flag = st_matrix("nonzero_b_flag")'
+    unique_flag = st_matrix("unique_flag")'
+    structure = st_matrix("structure")'
+    stripe = st_matrixcolstripe("e(b)")
+    
+    // Extract variable and category names
+    catnames = stripe[.,1]
+    varnames = stripe[.,2]
+    varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+    catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+    
+    // Handle lags
+    labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+    
+    // Add category name when flag is not unique
+    labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+    
+    // Clean labels
+    labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+    labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+    
+    // Filter for structure == 1
+    nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+    
+    // Add header row
+    nonzero_labels_structure = "v1"\nonzero_labels_structure
+    
+    // Write to temporary file
+    fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w")
+    for (i=1; i<=rows(nonzero_labels_structure); i++) {
+        fput(fh, nonzero_labels_structure[i])
+    }
+    fclose(fh)
+end
+
+    * Import cleaned labels into Stata as new dataset
+    import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8)
+	gen n = _n
+    
+    * Export labels to Excel
+    putexcel set "$dir_results/reg_education", sheet("E2a") modify
+	
+	* Vertical labels
+    sum n, meanonly
+	local N = r(max)+1
+	
+	forvalue i = 2/`N' {
+		local j = `i' - 1
+		putexcel A`i' = v1[`j'] 
+	}
+	
+	* Horizontal labels (one header cell per label row, written from column C rightwards)
+	sum n, meanonly
+	local N = r(max)  // v1[`j'] is indexed directly below, so the bound is the last row; r(max)+1 would read past the end of v1
+
+	forvalues j = 1/`N' {
+		local n = `j'+2 // Shift by 2 to start from column C
+		local col ""
+		
+		while `n' > 0 {
+			local rem = mod(`n' - 1, 26)
+			local col = char(65 + `rem') + "`col'"
+			local n = floor((`n' - 1)/26)
+		}
+
+		putexcel `col'1 = v1[`j']
+	}	
+		
+    *Clean up
+    erase "$dir_results/temp_labels.txt"
+
+
+* Goodness of fit
+
+putexcel set "$dir_results/reg_education", sheet("Gof") modify
+
+putexcel A13 = "E2a - Education attainment, not in initial education spell", bold		
+
+putexcel A15 = "Pseudo R-squared" 
+putexcel B15 = r2_p 
+putexcel A16 = "N"
+putexcel B16 = N_sample
+
+restore		
+* Clean up 		
+drop in_sample p1 p2 p3
+scalar drop _all
+matrix drop _all
+//frame drop temp_frame 	
+
 
 capture log close
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do
index 9ae5d272d..1ae2a996e 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do
@@ -3,7 +3,11 @@
 * SECTION:			Fertility
 * OBJECT: 			Final Probit Models
 * AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* LAST UPDATE:		1 July 2025 DP  
+* COUNTRY: 			UK 
+*
+* NOTES:			    Simplified the fertility process for those in their initial 
+* 						education spell.  
 ********************************************************************************
 clear all
 set more off
@@ -12,116 +16,415 @@ set type double
 //set maxvar 120000
 set maxvar 30000
 
+*******************************************************************
+cap log close 
+log using "${dir_log}/reg_fertility.log", replace
+*******************************************************************
+use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+do "$dir_do/variable_update"
 
-* Directory which contains do files
-global dir_do "${dir_work}/do"
 
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
+* sample selection 
+drop if dag < 16
 
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
 
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
+* Set Excel file 
 
+* Info sheet
 
-*******************************************************************
-cap log close 
-log using "${dir_log}/reg_fertility.log", replace
-*******************************************************************
-use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
+putexcel set "$dir_results/reg_fertility", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters governing projection of fertility"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
 
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "F1a"
+putexcel B5 = "Probit regression estimates of the probability of  having a child for women aged 18-44 in initial education spell"
+putexcel A6 = "F1b"
+putexcel B6 = "Probit regression estimates of probability of having a child for women aged 18-44 not in initial education spell"
 
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
-
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children" 
-			
-label define gdr 1  "Male" 0 "Female"
-				
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-			
-label define yn	1 "Yes" 0 "No"
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable dhhtp_c4 "Household Type: 4 Category"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 3 Category" 
-label variable dhe "Self-rated Health"
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable dnc "Number of Children in Household"
-label variable dnc02 "Number of Children aged 0-2 in Household"
-label variable ydses_c5 "Annual Household Income Quintile" 
-label variable dukfr "UK Fertility Rate"
-
-label value dgn gdr
-label value drgn1 rgna
-label value dhhtp_c4 hht 
-label value les_c3 jbf 
-label value deh_c3 edd 
-label value ded yn
+putexcel A10 = "Notes:", bold
+putexcel B10 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot), covid dummies (y2020 y2021)"
+putexcel B11 = "F1a: only 24 obs having a child when in initial education spell, therefore have to take away some covariates to obtain estimate"
 
-drop if dag < 16
-replace stm = stm - 2000
 
-/*check if all covariates are available in the data*/ 
-recode dhe dnc dnc02 deh_c3 les_c3 ydses_c5 dcpst drgn1 sprfm scedsmpl dukfr    (-9=. )
-recode dchpd (-9=0)
+putexcel set "$dir_results/reg_fertility", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold		
 
 xtset idperson swv
 
+**********************************************
+* F1a - Having a child, in initial edu spell * 
+**********************************************
+
+* Process F1a: Probability of having a child 
+* Sample: Women aged 18-44, in initial education spell.
+* DV: New born child dummy (note that in the estimation sample dchpd contains the number of newborn children, which could be >1) 
+
+replace dchpd=1 if dchpd>1 & dchpd<. 
+// only 69 ppl meet the condition in total
+tab dchpd if (sprfm == 1 & ded == 1) 
+
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if ///
+    sprfm == 1 & ded == 1 [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_F1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(F1a, dimlwt) side dec(4) 
 
-**********************************************************************
-*Proces F1a - Probability of Having a Child - In continuous education
-**********************************************************************
-*Sample: Women aged 18-44 not in continuous education.
-probit dchpd dag l.dnc il.dnc02 ib1.dcpst if (sprfm==1 & scedsmpl==1) [pweight=disclwt], vce(robust)
+probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if ///
+    sprfm == 1 & ded == 1 [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_F1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1a, disclwt) side dec(4)
+
+probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if ///
+    sprfm == 1 & ded == 1 [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_F1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1a, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_F1a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+
+probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if ///
+    sprfm == 1 & ded == 1 [pweight=dimxwt], vce(robust)
+
+
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/Fertility_w", sheet("Process F1a - In education") replace
+putexcel set "$dir_raw_results/fertility/fertility", sheet("Process F1a - In education") replace
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/F1a.doc", replace ///
-title("Process F1a: Probability of giving birth to a child. Sample: Women aged 18-44 in continuous education.") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/fertility/F1a.doc", replace ///
+title("Process F1a: Probability of giving birth to a child. Sample: Women aged 18-44 in initial education spell.") ///
  ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+	
+
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/F1a_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+
+* Results 	
+* Note: Zeros eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+*  Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/fertility/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/fertility/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_fertility", sheet("F1a") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_fertility", sheet("F1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+	
+* Labelling	
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dag"
+putexcel A3 = "Dcpst_Single"
+putexcel A4 = "Year_transformed"
+putexcel A5 = "Ethn_Black"
+putexcel A6 = "Constant"
 
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dag"
+putexcel D1 = "Dcpst_Single"
+putexcel E1 = "Year_transformed"
+putexcel F1 = "Ethn_Black"	
+putexcel G1 = "Constant"	
 
-************************************************************************
-*Proces F1b Probability of Having a Child - Not in continuous education
-*************************************************************************
-*Sample: Women aged 18-44 not in continuous education.
-gen ddnc02 = (dnc02 > 0)
-probit dchpd dag dagsq l.dnc l.ddnc02 ib1.dhe ib1.dcpst dukfr li.les_c3 ib8.drgn1 if (sprfm==1 & scedsmpl==0) [pweight=disclwt], vce(robust)
+	
+* Goodness of fit
 
+putexcel set "$dir_results/reg_fertility", sheet("Gof") modify
+
+putexcel A3 = "F1a - Fertility in initial education spell", bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
+************************************************
+* F1b - Having a child, left initial edu spell *
+************************************************
+
+* Process F1b: Probability of having a child 
+* Sample:	Women aged 18-44, left initial education spell
+* DV:	New born child dummy 
+
+tab dchpd if (sprfm == 1 & ded == 0) 
+
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ ///
+    lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+    (sprfm == 1 & ded == 0) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_F1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(F1b, dimlwt) side dec(4) 
+
+probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ ///
+    lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+    (sprfm == 1 & ded == 0) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_F1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1b, disclwt) side dec(4)
+
+probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ ///
+    lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+    (sprfm == 1 & ded == 0) [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_F1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1b, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_F1b.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+
+probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ ///
+    lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if ///
+    (sprfm == 1 & ded == 0) [pweight=dimxwt], vce(robust)
+
+	* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/Fertility_w", sheet("Process F1b - Not in education") modify
+putexcel set "$dir_raw_results/fertility/fertility", sheet("Process F1b - Not in education") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-
-outreg2 stats(coef se pval) using "$dir_data/F1b.doc", replace ///
-title("Process F1b: Probability of giving birth to a child. Sample: Women aged 18-44 not in continuous education.") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/fertility/F1b.doc", replace ///
+title("Process F1b: Probability of giving birth to a child. Sample: Women aged 18-44 not in initial education spell.") ///
  ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
+ 
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/F1b_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	 
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+	
+* Results 
+* Note: Zeros eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+* Store variance-covariance matrix 
+
+preserve
 
+putexcel set "$dir_raw_results/fertility/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/fertility/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_fertility", sheet("F1b") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_fertility", sheet("F1b") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 	
  
-capture log close 
+ 
+* Labelling 
+ 
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dag"
+putexcel A3 = "Dag_sq"
+putexcel A4 = "Ydses_c5_Q2_L1"
+putexcel A5 = "Ydses_c5_Q3_L1"
+putexcel A6 = "Ydses_c5_Q4_L1"
+putexcel A7 = "Ydses_c5_Q5_L1"
+putexcel A8 = "Dnc_L1"
+putexcel A9 = "Dnc02_L1"
+putexcel A10 = "Dhe_pcs"
+putexcel A11 = "Dhe_mcs"
+putexcel A12 = "Dcpst_Single_L1"
+putexcel A13 = "Dcpst_PreviouslyPartnered_L1"
+putexcel A14 = "Deh_c3_Medium"
+putexcel A15 = "Deh_c3_Low"
+putexcel A16 = "FertilityRate"
+putexcel A17 = "Les_c3_Student_L1"
+putexcel A18 = "Les_c3_NotEmployed_L1"
+putexcel A19 = "UKC"
+putexcel A20 = "UKD"
+putexcel A21 = "UKE"
+putexcel A22 = "UKF"
+putexcel A23 = "UKG"
+putexcel A24 = "UKH"
+putexcel A25 = "UKJ"
+putexcel A26 = "UKK"
+putexcel A27 = "UKL"
+putexcel A28 = "UKM"
+putexcel A29 = "UKN"
+putexcel A30 = "Year_transformed"
+putexcel A31 = "Y2020"
+putexcel A32 = "Y2021"
+putexcel A33 = "Ethn_Asian"
+putexcel A34 = "Ethn_Black"
+putexcel A35 = "Ethn_Other"
+putexcel A36 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dag"
+putexcel D1 = "Dag_sq"
+putexcel E1 = "Ydses_c5_Q2_L1"
+putexcel F1 = "Ydses_c5_Q3_L1"
+putexcel G1 = "Ydses_c5_Q4_L1"
+putexcel H1 = "Ydses_c5_Q5_L1"
+putexcel I1 = "Dnc_L1"
+putexcel J1 = "Dnc02_L1"
+putexcel K1 = "Dhe_pcs"
+putexcel L1 = "Dhe_mcs"
+putexcel M1 = "Dcpst_Single_L1"
+putexcel N1 = "Dcpst_PreviouslyPartnered_L1"
+putexcel O1 = "Deh_c3_Medium"
+putexcel P1 = "Deh_c3_Low"
+putexcel Q1 = "FertilityRate"
+putexcel R1 = "Les_c3_Student_L1"
+putexcel S1 = "Les_c3_NotEmployed_L1"
+putexcel T1 = "UKC"
+putexcel U1 = "UKD"
+putexcel V1 = "UKE"
+putexcel W1 = "UKF"
+putexcel X1 = "UKG"
+putexcel Y1 = "UKH"
+putexcel Z1 = "UKJ"
+putexcel AA1 = "UKK"
+putexcel AB1 = "UKL"
+putexcel AC1 = "UKM"
+putexcel AD1 = "UKN"
+putexcel AE1 = "Year_transformed"
+putexcel AF1 = "Y2020"
+putexcel AG1 = "Y2021"
+putexcel AH1 = "Ethn_Asian"
+putexcel AI1 = "Ethn_Black"
+putexcel AJ1 = "Ethn_Other"
+putexcel AK1 = "Constant"
+
+ 
+* Goodness of fit
 
+putexcel set "$dir_results/reg_fertility", sheet("Gof") modify
 
+putexcel A9 = "F1b - Fertility left initial education spell", bold		
 
+putexcel A11 = "Pseudo R-squared" 
+putexcel B11 = r2_p 
+putexcel A12 = "N"
+putexcel B12 = N 
+putexcel E11 = "Chi^2"		
+putexcel F11 = chi2
+putexcel E12 = "Log likelihood"		
+putexcel F12 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+ 
+ 
+capture log close 
 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_health.do b/input/InitialPopulations/compile/RegressionEstimates/reg_health.do
index 7da6ae616..d75290506 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_health.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_health.do
@@ -1,9 +1,12 @@
 ********************************************************************************
 * PROJECT:  		ESPON 
 * SECTION:			Health
-* OBJECT: 			Final Probit and Linear Regression Models - Weighted
+* OBJECT: 			Health status and Disability
 * AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* LAST UPDATE:		1 July 2025 DP  
+* COUNTRY: 			UK 
+*
+* NOTES:			     
 ********************************************************************************
 clear all
 set more off
@@ -17,7 +20,8 @@ set maxvar 30000
 *	DEFINE DIRECTORIES
 *******************************************************************************/
 * Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+//global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates"
 
 * Directory which contains do files
 global dir_do "${dir_work}/do"
@@ -29,137 +33,1032 @@ global dir_data "${dir_work}/data"
 global dir_log "${dir_work}/log"
 
 * Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
-
+//global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
+global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data"
 
 *******************************************************************
 cap log close 
 log using "${dir_log}/reg_health.log", replace
 *******************************************************************
-
 use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
+do "$dir_do/variable_update"
 
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
 
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
+* Sample selection 
+drop if dag < 16
 
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children"
-			
-label define gdr 1  "Male" 0 "Female"
-				
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-			
-label define yn	1 "Yes" 0 "No"
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable dhhtp_c4 "Household Type: 4 Category"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 3 Category" 
-label variable dhe "Self-rated Health"
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable ydses_c5 "Annual Household Income Quintile" 
-label variable dlltsd "Long-term Sick or Disabled"
-
-label value dgn gdr
-label value drgn1 rgna
-label value dhhtp_c4 hht 
-label value les_c3 jbf 
-label value deh_c3 edd 
-label value ded yn
 
+* Set Excel file 
 
-drop if dag < 16
-replace stm = stm - 2000
+* Info sheet
+
+putexcel set "$dir_results/reg_health", sheet("Info") replace // same workbook as the H1a / H1a_raw sheets written below
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters governing projection self-reported health status"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+
+putexcel A5 = "H1a"
+putexcel B5 = "Generalized ordered logit regression estimates of self reported health status - individuals aged 16-29 in initial education spell"
+putexcel B6 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table"
+putexcel B7 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name."
+
+putexcel A8 = "H1b"
+putexcel B8 = "Generalized ordered logit regression estimates of self reported health status - individuals aged 16+ not in initial education spell"
+putexcel B9 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table"
+putexcel B10 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name."
+
+putexcel A11 = "H2b"
+putexcel B11 = "Probit regression estimates of the probability of being long-term sick or disabled - people aged 16+ not in initial education spell"
+
+putexcel A12 = "H1a_raw"
+putexcel B12 = "Raw generalized ordered logit regression estimates of self reported health status - individuals aged 16-29 in initial education spell. Useful for the 'Gologit predictor' file."
+putexcel A13 = "H1b_raw"
+putexcel B13 = "Raw generalized ordered logit regression estimates of self reported health status - individuals aged 16+ not in initial education spell. Useful for the 'Gologit predictor' file."
+
+putexcel A15 = "Notes:", bold
+putexcel B15 = "All processes: replaced lagged dhe with lagged dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot), covid dummies (y2020 y2021)"
+putexcel B16 = "H1a and H1b: excluded those with imputed values of dhe"
+putexcel B17 = "H1a: some covariates had to be dropped to obtain estimates; lagged income quintile is treated as continuous variable"
+putexcel B18 = "H2b: used wider definition of disability (Dlltsd01), incl those declaring themselves as disabled or receiving disability benefits"
+
+putexcel set "$dir_results/reg_health", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold	
+
 
-/*check if all covariates are available in the data*/ 
-recode dhe  deh_c3 les_c3 ydses_c5 dhhtp_c4 drgn1 stm  (0= .) (-9=. ) 
-recode dgn dag dagsq (-9=.)
 
 xtset idperson swv
 
+********************************************
+* H1a: Health status, in initial edu spell *
+********************************************
+
+* Process H1a: Probability of each self-rated health status for those who 
+* 				are in their initial education spell 
+* Sample: 16-29 year olds who are in their initial education spell 
+* DV: Categorical health status (5)	
 
-**********************************
-*Process 1a: Those in education  *
-**********************************
-*
-*Self-rated health status for those in continuous education.
-*sample: 16-29 year olds who have always been in education without a break
 fre dhe if (dag>=16 & dag<=29 & ded==1 )
 
-/*
-regress dhe i.dgn dag dagsq li.ydses_c5 l.dhe ib8.drgn1 stm if scedsmpl==1 [pweight=disclwt], vce(robust)
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set "$dir_data/health.xlsx", sheet("Process H1a - Self-rated Health") replace
-putexcel A1 = matrix(results), names nformat(number_d2) 
-putexcel A1 = matrix(results), names nformat(number_d2)
+/* Ordered probit models to replace linear regression 
+oprobit dhe i.dgn dag dagsq li.ydses_c5 ilb5.dhe ib8.drgn1 stm if (dag>=16 & dag<=29 & ded==1) [pweight=disclwt], vce(robust)
 */
 
-* Ordered probit models to replace linear regression 
-oprobit dhe i.dgn dag dagsq li.ydses_c5 ilb5.dhe ib8.drgn1 stm if (dag>=16 & dag<=29 & ded==1) [pweight=disclwt], vce(robust)
+* Generalized ordered logit			
+gologit2 dhe i.Dgn Dag Dag_sq L_Ydses_c5 L_Dhe_pcs L_Dhe_mcs i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN Year_transformed Y2020 Y2021 i.Ethn_Asian i.Ethn_Black i.Ethn_Other ///
+    if dag >= 16 & dag <= 29 & ded == 1 & dhe_flag != 1 ///
+	[pweight = dimxwt], autofit
+*Note: In gologit2, the coefficients show how covariates affect the log-odds of being above a certain category vs. at or below it.
+
+
+	*raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/health", sheet("Process H1a") replace
-putexcel A3 = matrix(results), names nformat(number_d2) 
+putexcel set "$dir_raw_results/health/health", sheet("Process H1a") replace
+putexcel A3 = matrix(results), names //nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/H1a.doc", replace ///
-title("Process H1a: Ordered probit regression estimates of self reported health status - individuals aged 16-29 in continuous education") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/health/H1a.doc", replace ///
+title("Process H1a: Generalised ordered logit regression estimates of self reported health status - individuals aged 16-29 in initial education spell") ///
  ctitle(health status) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+	
+* Save sample inclusion indicator and predicted probabilities		
+gen in_sample = e(sample)
+predict p1 p2 p3 p4 p5
+	
+* Save sample for later use (internal validation)	
+save "$dir_validation_data/H1a_sample", replace
 
+* Store model summary statistics
+scalar r2_p = e(r2_p) 
+scalar N_sample = e(N)	 
+	
+* Store results in Excel 
 
-****************************************
-*Process 1b: Those in not in education *
-****************************************
-*
-*Self-rated health status for those not in continuous education (out of education or returned having left education in the past).
-*sample: 16 or older who are not in continuous education
-fre dhe if (dag>=16 & ded==0 )
-/*
-regress dhe i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 l.dhe lib1.dhhtp_c4 ib8.drgn1 stm if scedsmpl==0 [pweight=disclwt], vce(robust)
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set health, sheet("Process H1b - Not in education") modify
-putexcel A1 = matrix(results), names nformat(number_d2) 
-*/
+* Store estimates in matrices
+matrix b = e(b)	
+matrix V = e(V)
+
+* Raw output 
+putexcel set "$dir_results/reg_health", sheet("H1a_raw") modify
+putexcel A1 = matrix(b'), names //nformat(number_d2) 
+putexcel A1 =  "CATEGORY"
+putexcel B1 =  "REGRESSOR"
+putexcel C1 =  "COEFFICIENT"
+
+* Estimated coefficients 
+scalar no_coefs_all = colsof(b)
+
+* Eliminate rows and columns containing zeros (baseline cats) 
+mata:
+	// Call matrices into mata 
+    b = st_matrix("b")
+
+    // Find which coefficients are nonzero
+    keep = (b :!= 0)
+	
+    // Eliminate zeros
+	nonzero_b = select(b, keep)
+
+	// Inspect
+	nonzero_b 
+	
+    // Return to Stata
+    st_matrix("nonzero_b", nonzero_b)
+	st_matrix("nonzero_b_flag", keep)
+end	
+
+* Inspect
+matrix list b 
+matrix list nonzero_b
+matrix list nonzero_b_flag
+
+* Save dimensions
+scalar no_nonzero_b = colsof(nonzero_b)
+scalar no_nonzero_b_per = no_nonzero_b / 4 // number of categories-1 
+
+* Address repetition of proportional odds covariates
+
+* Generate repetition/unique observation flag
+mata:
+	// Import matrices into mata
+	nonzero_b_mata = st_matrix("nonzero_b")
+	
+	// Generate binary vector =1 if coefficient repeated 
+	n = cols(nonzero_b_mata)
+	repetition_flag = J(n, 1, 0)
+
+	// use tolerance based comparison to avoid precision errors 
+	tol = 1e-8
+
+		for (i = 1; i <= n; i++) {
+			for (j = 1; j <= n; j++) {
+				if (i != j && abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) {
+					repetition_flag[i] = 1
+					break
+				}
+			}
+	}
+	repetition_flag
+
+	// Generate binary vector =1 if coefficient not repeated 
+	unique_flag  = 1 :- repetition_flag
+
+	// Return to Stata
+	st_matrix("repetition_flag", repetition_flag')
+	st_matrix("unique_flag", unique_flag')
+
+end
 
-* Ordered probit models to replace linear regression 
+* Generate vector to multiply the coef vector with to eliminate the 
+* repetitions of coefficients for vars that satisfy the proportional odds assumptions
+matrix structure_a = J(1,no_nonzero_b_per,1)
+matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b]
+matrix structure = structure_a, structure_b
+
+* Inspect
+matrix list structure_a
+matrix list structure_b
+matrix list structure
+matrix list nonzero_b
+
+* Eliminate repetitions: zero out repeated coefficients via `structure', then drop the zeros
+mata:
+	// Call matrices into mata 
+	// (dropped unused read of Stata matrix "var": it is not used in this block)
+	structure = st_matrix("structure")
+	nonzero_b = st_matrix("nonzero_b")
+	
+	// Convert repetitions into zeros 
+	b_structure = structure :* nonzero_b
+
+	b_structure 
+	
+	// Eliminate zeros 
+	keep = (b_structure :!= 0)
+	
+	nonzero_b_structure = select(b_structure, keep)
+	
+	// Export to Stata
+	st_matrix("b_structure", b_structure)
+	st_matrix("nonzero_b_structure", nonzero_b_structure)
+
+end
+
+matrix list nonzero_b_structure
+
+* Export into Excel 
+putexcel set "$dir_results/reg_health", sheet("H1a") modify
+putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) 
+
+
+
+* Variance-covariance matrix 
+* Eliminate zeros (baseline categories)
+mata:
+    V = st_matrix("V")
+    b = st_matrix("b")
+
+    // Find which coefficients are nonzero
+    keep = (b :!= 0)
+	
+	// Eliminate zeros 
+    V_trimmed = select(V, keep)
+    V_trimmed = select(V_trimmed', keep)'
+
+	V_trimmed 
+	
+    // Return to Stata
+    st_matrix("var", V_trimmed)
+end			
+
+matrix list var
+
+* Address repetition due to proportional odds being satisfied for some covars
+matrix square_structure_a = J(no_nonzero_b,1,1) * structure
+matrix square_structure_b = square_structure_a'
+
+matrix list square_structure_a
+matrix list square_structure_b
+mata:
+	// Call matrices into mata 
+	var = st_matrix("var")
+	
+	// Create structure matrix (0 = eliminate)
+	square_structure_a = st_matrix("square_structure_a")
+	square_structure_b = st_matrix("square_structure_b")
+	
+	// Element-by-element multiplication
+	square_structure = square_structure_a :* square_structure_b 
+	var_structure = square_structure :* var
+	
+	// Eliminate zeros 
+	row_keep = rowsum(abs(var_structure)) :!= 0
+	col_keep = colsum(abs(var_structure)) :!= 0
+
+	nonzero_var_structure = select(select(var_structure, row_keep), col_keep)
+
+	// Return to Stata
+	st_matrix("nonzero_var_structure", nonzero_var_structure)
+end
+
+matrix list nonzero_var_structure
+
+* Export to Excel 
+putexcel set "$dir_results/reg_health", sheet("H1a") modify
+putexcel C2 = matrix(nonzero_var_structure)
+		
+			
+* Labels
+putexcel set "$dir_results/reg_health", sheet("H1a") modify
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+/* Create temporary frame ==> not available in stata 14
+frame create temp_frame
+frame temp_frame: {
+    
+    mata: 
+		// Import matrices from Stata
+		nonzero_b_flag = st_matrix("nonzero_b_flag")'
+		unique_flag = st_matrix("unique_flag")'
+		structure = st_matrix("structure")'
+		stripe = st_matrixcolstripe("e(b)")
+		
+		// Extract variable and category names
+		catnames = stripe[.,1]
+		varnames = stripe[.,2]
+		varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+		catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+		
+		// Create and clean labels 
+		// Address lags
+		labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+		
+		// Add category 
+		labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+		
+		// Remove 1. 
+		labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+		
+		// Constant 
+		labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+					
+		nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+		
+		// Add v1
+		nonzero_labels_structure = "v1"\nonzero_labels_structure
+		
+		// Create temp file with results
+		fh = fopen("$dir_results/temp_labels.txt", "w")
+		for (i=1; i<=rows(nonzero_labels_structure); i++) {
+			fput(fh, nonzero_labels_structure[i])
+		}
+		fclose(fh)
+    end
+ */
+ * Here's a replacement for stata 14: 
+local dir_results "$dir_results"  
+
+preserve
+* Run Mata block
+mata: 
+    // Import matrices from Stata
+    nonzero_b_flag = st_matrix("nonzero_b_flag")'
+    unique_flag = st_matrix("unique_flag")'
+    structure = st_matrix("structure")'
+    stripe = st_matrixcolstripe("e(b)")
+    
+    // Extract variable and category names
+    catnames = stripe[.,1]
+    varnames = stripe[.,2]
+    varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+    catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+    
+    // Handle lags
+    labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+    
+    // Append "_" to every label, plus the category (equation) name where unique_flag is nonzero
+    labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+    
+    // Clean labels
+    labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+    labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+    
+    // Filter for structure == 1
+    nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+    
+    // Add header row
+    nonzero_labels_structure = "v1"\nonzero_labels_structure
+    
+    // Write to temporary file
+    fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w")
+    for (i=1; i<=rows(nonzero_labels_structure); i++) {
+        fput(fh, nonzero_labels_structure[i])
+    }
+    fclose(fh)
+end
+
+    * Load the label file written above as a one-column dataset (v1); clear replaces the data in memory
+    import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8)
+	gen n = _n
+    
+    * Export labels to Excel
+    putexcel set "$dir_results/reg_health", sheet("H1a") modify
+	
+	* Vertical labels: label j goes to cell A(j+1), because row 1 holds the header
+    sum n, meanonly
+	local N = r(max)+1
+	
+	forvalue i = 2/`N' {
+		local j = `i' - 1
+		putexcel A`i' = v1[`j'] 
+	}
+	
+	* Horizontal labels: label j goes to row 1 of column j+2 (C, D, ...)
+	sum n, meanonly
+	local N = r(max)  // one column per label; r(max)+1 would index past the last observation
+
+	forvalues j = 1/`N' {
+		local n = `j'+2 // Shift by 2 to start from column C
+		local col ""
+		
+		* Convert the column number into Excel letters (1 = A, 26 = Z, 27 = AA, ...)
+		while `n' > 0 {
+			local rem = mod(`n' - 1, 26)
+			local col = char(65 + `rem') + "`col'"
+			local n = floor((`n' - 1)/26)
+		}
+		putexcel `col'1 = v1[`j']
+	}	
+		
+    *Clean up
+    erase "$dir_results/temp_labels.txt"
+
+
+* Export model fit statistics
+putexcel set "$dir_results/reg_health", sheet("Gof") modify
+
+putexcel A3 = "H1a - Health status, in initial education spell", bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N_sample
+
+restore	
+* Clean up 		
+drop in_sample p1 p2 p3 p4 p5 
+scalar drop _all
+matrix drop _all
+//frame drop temp_frame 	
+
+
+******************************************************
+* Process H1b: Health status, left initial edu spell *
+******************************************************
+
+* Process H1b: Probability of each self-rated health status for those who 
+* 				have left their initial education spell 
+* Sample: 16 or older who have left their initial education spell 
+* DV: Categorical health status (5)
+
+/* Ordered probit models to replace linear regression 
 oprobit dhe i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ilb5.dhe lib1.dhhtp_c4 ib8.drgn1 stm if (dag>=16 & ded==0)  [pweight=disclwt], vce(robust)
+*/
+
+ * Generalized ordered logit	
+sort idperson swv
+
+gologit2 dhe i.Dgn Dag Dag_sq ///
+i.Deh_c3_Medium i.Deh_c3_Low ///
+	i.L_Les_c3_Student i.L_Les_c3_NotEmployed ///
+	/*L_Ydses_c5*/ i.L_Ydses_c5_Q2 i.L_Ydses_c5_Q3 i.L_Ydses_c5_Q4 i.L_Ydses_c5_Q5 ///
+	L_Dhe_pcs L_Dhe_mcs  ///
+	i.L_Dhhtp_c4_CoupleChildren i.L_Dhhtp_c4_SingleNoChildren i.L_Dhhtp_c4_SingleChildren ///
+	i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN ///
+	Year_transformed Y2020 Y2021 ///
+	i.Ethn_Asian i.Ethn_Black i.Ethn_Other ///
+	if dhe_flag != 1 & ///
+	dag >= 16 & ded == 0 [pweight = dimxwt], autofit
+*Note: In gologit2, the coefficients show how covariates affect the log-odds of being above a certain category vs. at or below it.
+
+
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/health", sheet("Process H1b") modify
+putexcel set "$dir_raw_results/health/health", sheet("Process H1b") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/H1b.doc", replace ///
-title("Process H1b: Ordered probit regression estimates of self reported health status - individuals aged 16+ not in continuous education") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/health/H1b.doc", replace ///
+title("Process H1b: Generalised Ordered logit regression estimates of self reported health status - individuals aged 16+ not in continuous education") ///
  ctitle(health status) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+	
+* Save sample inclusion indicator and predicted probabilities		
+gen in_sample = e(sample)
+predict p1 p2 p3 p4 p5
+	
+* Save sample for later use (internal validation)	
+save "$dir_validation_data/H1b_sample", replace
+
+* Store model summary statistics
+scalar r2_p = e(r2_p) 
+scalar N_sample = e(N)	 
+	
+* Store results in Excel 
+
+* Store estimates in matrices
+matrix b = e(b)	
+matrix V = e(V)
+
+* Raw output 
+putexcel set "$dir_results/reg_health", sheet("H1b_raw") modify
+putexcel A1 = matrix(b'), names //nformat(number_d2) 
+putexcel A1 =  "CATEGORY"
+putexcel B1 =  "REGRESSOR"
+putexcel C1 =  "COEFFICIENT"
+
+* Estimated coefficients 
+scalar no_coefs_all = colsof(b)
+
+* Drop the exact-zero entries of b (baseline categories) and record which positions were kept
+mata:
+	// Copy the coefficient vector b from Stata into Mata
+    b = st_matrix("b")
+
+    // keep[j] = 1 when coefficient j is nonzero, 0 otherwise
+    keep = (b :!= 0)
+	
+    // Keep only the entries of b whose flag is 1
+	nonzero_b = select(b, keep)
+
+	// Print the trimmed vector to the log
+	nonzero_b 
+	
+    // Return to Stata: the trimmed coefficients and the 0/1 keep flags (same length as b)
+    st_matrix("nonzero_b", nonzero_b)
+	st_matrix("nonzero_b_flag", keep)
+end	
+
+* Inspect
+matrix list b 
+matrix list nonzero_b
+matrix list nonzero_b_flag
+
+* Save dimensions
+scalar no_nonzero_b = colsof(nonzero_b)
+scalar no_nonzero_b_per = no_nonzero_b / 4 // number of categories-1 
+
+* Address repetition of proportional odds covariates
+
+* Generate repetition/unique observation flag
+mata:
+	// Pull the vector of non-baseline coefficients out of Stata
+	coefs = st_matrix("nonzero_b")
+	k = cols(coefs)
+
+	// Two coefficients count as "the same" when they differ by less than
+	// this tolerance (guards against floating-point noise)
+	tol = 1e-8
+
+	// repetition_flag[i] = 1 when some OTHER coefficient lies within tol of
+	// coefficient i. A non-missing element is always within tol of itself,
+	// so more than one hit in the comparison means a genuine duplicate.
+	repetition_flag = J(k, 1, 0)
+	for (i = 1; i <= k; i++) {
+		hits = sum(abs(coefs :- coefs[i]) :< tol)
+		repetition_flag[i] = (hits > 1)
+	}
+
+	// Show the flags in the log
+	repetition_flag
+
+	// Complement: 1 when the coefficient appears only once
+	unique_flag  = 1 :- repetition_flag
+
+	// Hand both flags back to Stata as row vectors
+	st_matrix("repetition_flag", repetition_flag')
+	st_matrix("unique_flag", unique_flag')
+
+end
+
+* Build the 0/1 mask "structure" that is multiplied into the coefficient vector to drop the repeated
+* coefficients of variables that satisfy the proportional odds assumption (1 = keep, 0 = drop)
+matrix structure_a = J(1,no_nonzero_b_per,1)
+matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b]
+matrix structure = structure_a, structure_b
+
+* Inspect: the first no_nonzero_b_per entries are all 1, the remaining entries are copied from unique_flag
+matrix list structure_a
+matrix list structure_b
+matrix list structure
+matrix list nonzero_b
+
+* Eliminate repetitions 
+mata:
+	// Load the 0/1 mask "structure" and the non-baseline coefficients.
+	// The trimmed variance matrix "var" is only created further down, so
+	// it is deliberately not read in this block.
+	structure = st_matrix("structure")
+	nonzero_b = st_matrix("nonzero_b")
+	
+	// Convert repetitions into zeros (element-wise product with the mask)
+	b_structure = structure :* nonzero_b
+
+	// Print the masked vector to the log for inspection
+	b_structure 
+	
+	// Eliminate zeros: keep only the entries that survived the mask
+	keep = (b_structure :!= 0)
+	nonzero_b_structure = select(b_structure, keep)
+	
+	// Export to Stata: masked vector and its nonzero-only version
+	st_matrix("b_structure", b_structure)
+	st_matrix("nonzero_b_structure", nonzero_b_structure)
+end
+
+matrix list nonzero_b_structure
+
+* Export into Excel 
+putexcel set "$dir_results/reg_health", sheet("H1b") modify
+putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) 
+
+
+
+* Variance-covariance matrix 
+* Eliminate zeros (baseline categories): drop the rows and columns of V that belong to zero coefficients
+mata:
+    V = st_matrix("V")
+    b = st_matrix("b")
+
+    // Find which coefficients are nonzero
+    keep = (b :!= 0)
+	
+	// Eliminate zeros: the first select() drops columns, transpose-select-transpose then drops the matching rows
+    V_trimmed = select(V, keep)
+    V_trimmed = select(V_trimmed', keep)'
+
+	V_trimmed 
+	
+    // Return to Stata under the name "var"
+    st_matrix("var", V_trimmed)
+end			
+
+matrix list var
+
+* Address repetition due to proportional odds being satisfied for some covars: build square masks from "structure"
+matrix square_structure_a = J(no_nonzero_b,1,1) * structure
+matrix square_structure_b = square_structure_a'
+
+matrix list square_structure_a
+matrix list square_structure_b
+mata:
+	// Load the trimmed variance-covariance matrix stored as "var"
+	var = st_matrix("var")
+	
+	// Load the two square masks: every row of _a equals structure, every column of _b equals structure
+	square_structure_a = st_matrix("square_structure_a")
+	square_structure_b = st_matrix("square_structure_b")
+	
+	// Element-by-element product: entry (i,j) is structure[i]*structure[j], so a 0 in either position zeroes the cell
+	square_structure = square_structure_a :* square_structure_b 
+	var_structure = square_structure :* var
+	
+	// Eliminate zeros: keep the rows and columns that contain at least one nonzero entry
+	row_keep = rowsum(abs(var_structure)) :!= 0
+	col_keep = colsum(abs(var_structure)) :!= 0
+
+	nonzero_var_structure = select(select(var_structure, row_keep), col_keep)
+
+	// Return to Stata
+	st_matrix("nonzero_var_structure", nonzero_var_structure)
+end
+
+matrix list nonzero_var_structure
+
+* Export to Excel 
+putexcel set "$dir_results/reg_health", sheet("H1b") modify
+putexcel C2 = matrix(nonzero_var_structure)
+		
+			
+* Labels
+putexcel set "$dir_results/reg_health", sheet("H1b") modify
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+/* Create temporary frame ==> not available in stata 14 
+frame create temp_frame
+frame temp_frame: {
+    
+    mata: 
+		// Import matrices from Stata
+		nonzero_b_flag = st_matrix("nonzero_b_flag")'
+		unique_flag = st_matrix("unique_flag")'
+		structure = st_matrix("structure")'
+		stripe = st_matrixcolstripe("e(b)")
+		
+		// Extract variable and category names
+		catnames = stripe[.,1]
+		varnames = stripe[.,2]
+		varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+		catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+		
+		// Create and clean labels 
+		// Address lags
+		labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+		
+		// Add category 
+		labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+		
+		// Remove 1. 
+		labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+		
+		// Constant 
+		labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+					
+		nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+		
+		// Add v1
+		nonzero_labels_structure = "v1"\nonzero_labels_structure
+		
+		// Create temp file with results
+		fh = fopen("$dir_results/temp_labels.txt", "w")
+		for (i=1; i<=rows(nonzero_labels_structure); i++) {
+			fput(fh, nonzero_labels_structure[i])
+		}
+		fclose(fh)
+    end
+ */
+ * Here's a replacement for stata 14: 
+local dir_results "$dir_results"  
+
+preserve
+* Run Mata block
+mata: 
+    // Import the flag and mask vectors from Stata (transposed) and the column names of e(b)
+    nonzero_b_flag = st_matrix("nonzero_b_flag")'
+    unique_flag = st_matrix("unique_flag")'
+    structure = st_matrix("structure")'
+    stripe = st_matrixcolstripe("e(b)")
+    
+    // Column 1 of the stripe is used as the category name, column 2 as the variable name; drop zero-coefficient entries
+    catnames = stripe[.,1]
+    varnames = stripe[.,2]
+    varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+    catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+    
+    // Handle lags: a leading "L_" is removed and "_L1" is appended; names without that prefix pass through unchanged
+    labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+    
+    // Append "_" to every label, plus the category name where unique_flag is nonzero (coefficient not repeated)
+    labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+    
+    // Clean labels: remove the first "1." in each label and rename a leading "_cons" to "Constant"
+    labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+    labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+    
+    // Keep only the labels whose entry in the structure mask equals 1
+    nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+    
+    // Add header row ("v1" becomes the variable name when the file is imported with varnames(1))
+    nonzero_labels_structure = "v1"\nonzero_labels_structure
+    
+    // Write one label per line to temp_labels.txt in the directory held by the local macro dir_results
+    fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w")
+    for (i=1; i<=rows(nonzero_labels_structure); i++) {
+        fput(fh, nonzero_labels_structure[i])
+    }
+    fclose(fh)
+end
+
+
+
+    * Load the label file written above as a one-column dataset (v1); clear replaces the data in memory
+    import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8)
+	gen n = _n
+    
+    * Export labels to Excel
+    putexcel set "$dir_results/reg_health", sheet("H1b") modify
+	
+	* Vertical labels: label j goes to cell A(j+1), because row 1 holds the header
+    sum n, meanonly
+	local N = r(max)+1
+	
+	forvalue i = 2/`N' {
+		local j = `i' - 1
+		putexcel A`i' = v1[`j'] 
+	}
+	
+	* Horizontal labels: label j goes to row 1 of column j+2 (C, D, ...)
+	sum n, meanonly
+	local N = r(max)  // one column per label; r(max)+1 would index past the last observation
+
+	forvalues j = 1/`N' {
+		local n = `j'+2 // Shift by 2 to start from column C
+		local col ""
+		
+		* Convert the column number into Excel letters (1 = A, 26 = Z, 27 = AA, ...)
+		while `n' > 0 {
+			local rem = mod(`n' - 1, 26)
+			local col = char(65 + `rem') + "`col'"
+			local n = floor((`n' - 1)/26)
+		}
+		putexcel `col'1 = v1[`j']
+	}	
+		
+    *Clean up
+    erase "$dir_results/temp_labels.txt"
+
+
+	* Export model fit statistics
+putexcel set "$dir_results/reg_health", sheet("Gof") modify
+
+putexcel A9 = "H1b - Health status, left initial education spell", bold		
+
+putexcel A11 = "Pseudo R-squared" 
+putexcel B11 = r2_p 
+putexcel A12 = "N"
+putexcel B12 = N_sample
+
+restore		
+* Clean up 		
+drop in_sample p1 p2 p3 p4 p5 
+scalar drop _all
+matrix drop _all
+//frame drop temp_frame 	
 
  
-**********************************************************************************************
-*Process 2b: Probability of being long-term sick or disabled amongst those not in education  *
-**********************************************************************************************
-*
-*Probability of becoming long-term sick or disabled for those not in continuous education.
-*sample: 16 or older who are not in continuous education
-fre dhe if (dag>=16 & ded==0 )
 
-probit dlltsd i.dgn dag dagsq ib1.deh_c3 li.ydses_c5 ib5.dhe ilb5.dhe l.dlltsd lib1.dhhtp_c4 ib8.drgn1 stm if (dag>=16 & ded==0 & dag<56) [pweight=disclwt], vce(robust)
+***********************************************************
+* H2b: Long-term sick or disabled, left initial edu spell *
+***********************************************************
+
+* Process H2b: Probability of being long-term sick or disabled for those 
+* 				not in continuous education.
+* Sample: 16 or older who have left their initial education spell 
+* DV: Long term sick/disabled dummy ==> plus those on disability benefits 
+tab2 dlltsd dlltsd01
+
+fre dlltsd if (dag >= 16 & ded == 0)
+fre dlltsd01 if (dag >= 16 & ded == 0)
+fre les* if dlltsd01==1 
+/*fre les* if dlltsd01==1
+les_c4 -- LABOUR MARKET: Activity status
+---------------------------------------------------------------------------------
+                                    |      Freq.    Percent      Valid       Cum.
+------------------------------------+--------------------------------------------
+Valid   1 Employed or self-employed |       5549      11.46      11.47      11.47
+        2 Student                   |        646       1.33       1.34      12.81
+        3 Not employed              |      24806      51.25      51.28      64.09
+        4 Retired                   |      17368      35.88      35.91     100.00
+        Total                       |      48369      99.93     100.00           
+Missing .                           |         32       0.07                      
+Total                               |      48401     100.00                      
+---------------------------------------------------------------------------------
+*/
+
+/*probit dlltsd01 i.dgn dag dagsq ib1.deh_c3 li.ydses_c5 ib5.dhe ilb5.dhe l.dlltsd lib1.dhhtp_c4 ib8.drgn1 stm if (dag>=16 & ded==0) [pweight=disclwt], vce(robust) */
+
+probit dlltsd01 i.Dgn Dag Dag_sq ///
+       i.Deh_c3_Medium i.Deh_c3_Low ///
+	   li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 ///
+	   Dhe_pcs Dhe_mcs ///
+	   L_Dhe_pcs L_Dhe_mcs ///
+	   l.Dlltsd01 ///
+	   li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren /// 
+	   i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN ///
+	   Year_transformed Y2020 Y2021 ///
+	   i.Ethn_Asian i.Ethn_Black i.Ethn_Other ///
+if (dag >= 16 & ded == 0) ///
+  [pweight = dimxwt], vce(robust)
+
+
+  
+	* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/health", sheet("Process H2b") modify
+putexcel set "$dir_raw_results/health/health", sheet("Process H2b") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/H2b.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/health/H2b.doc", replace ///
 title("Process H2b: Probit regression estimates for being long-term sick or disabled - people aged 16+ not in continuous education") ///
  ctitle(long-term sick or disabled) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
+gen in_sample = e(sample)	
 
+predict p 
  
+* Save sample for later use (internal validation)
+save "$dir_validation_data/H2b_sample", replace
+
+* Store model summary statistics	
+scalar r2_p = e(r2_p) 
+scalar N_sample = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+* Store results in Excel 
+
+* Store estimates in matrices
+matrix b = e(b)	
+matrix V = e(V)
+
+* Eliminate the entries of b, and the matching rows and columns of V, where the coefficient is exactly zero (baseline cats) 
+mata:
+	// Copy the variance-covariance matrix and the coefficient vector from Stata
+    V = st_matrix("V")
+    b = st_matrix("b")
+
+    // keep[j] = 1 when coefficient j is nonzero, 0 otherwise
+    keep = (b :!= 0)
+	
+	// Eliminate zeros: select() on V drops columns, transpose-select-transpose then drops the matching rows
+	b_trimmed = select(b, keep)
+    V_trimmed = select(V, keep)
+    V_trimmed = select(V_trimmed', keep)'
+
+	// Print both trimmed objects to the log
+	b_trimmed 
+	V_trimmed 
+	
+    // Return to Stata; b_trimmed is transposed on the way out, the keep flags go back as nonzero_b_flag
+    st_matrix("b_trimmed", b_trimmed')
+    st_matrix("V_trimmed", V_trimmed)
+	st_matrix("nonzero_b_flag", keep)
+end	
+
+* Export into Excel 
+putexcel set "$dir_results/reg_health", sheet("H2b") modify
+putexcel B2 = matrix(b_trimmed)
+putexcel C2 = matrix(V_trimmed)
+
+
+* Labels 
+putexcel set "$dir_results/reg_health", sheet("H2b") modify
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+
+/* Use frame and Mata to extract nice labels from colstripe of e(b) ==> not working in stata 14
+frame create temp_frame
+frame temp_frame: {
+
+    mata: 
+		// Import matrices from Stata
+		nonzero_b_flag = st_matrix("nonzero_b_flag")'
+		stripe = st_matrixcolstripe("e(b)")
+		
+		// Extract variable names
+		varnames = stripe[.,2]
+		varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+		
+		// Create label vector
+		labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1)
+		labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+		labels_no_bl = regexm(labels_no_bl, "^L\.") :* (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ (!regexm(labels_no_bl, "^L\.") :* labels_no_bl)
+		labels_no_bl = regexm(labels_no_bl, "^1L.") :* (regexr(labels_no_bl, "^1L.", "") :+ "_L1") :+ (!regexm(labels_no_bl, "1L.") :* labels_no_bl)
+		labels_no_bl = regexr(labels_no_bl, "_Dgn_L1$", "_Dgn")
+		
+		labels_no_bl
+		
+		nonzero_labels_structure = "v1"\labels_no_bl
+		
+		// Create temp file 
+		fh = fopen("$dir_results/temp_labels.txt", "w")
+		for (i=1; i<=rows(nonzero_labels_structure); i++) {
+			fput(fh, nonzero_labels_structure[i])
+		}
+		fclose(fh)
+    end
+*/
+* STATA 14-COMPATIBLE LABEL EXTRACTION AND FILE EXPORT 
+* Mata: extract and clean labels
+mata: 
+    // Import the 0/1 keep flags (as a column) and the column names of e(b)
+    nonzero_b_flag = st_matrix("nonzero_b_flag")'
+    stripe = st_matrixcolstripe("e(b)")
+
+    // Extract varnames from stripe (2nd column) and drop the zero-coefficient (baseline) entries
+    varnames = stripe[.,2]
+    varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+
+    // Clean labels: strip "1.", rename _cons, move a leading "L." or "1L." lag prefix to an "_L1" suffix ([.] = literal period, no backslash escape needed)
+    labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1)
+    labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+    labels_no_bl = regexm(labels_no_bl, "^L[.]") :* (regexr(labels_no_bl, "^L[.]", "") :+ "_L1") :+ ///
+                   (!regexm(labels_no_bl, "^L[.]") :* labels_no_bl)
+    labels_no_bl = regexm(labels_no_bl, "^1L[.]") :* (regexr(labels_no_bl, "^1L[.]", "") :+ "_L1") :+ ///
+                   (!regexm(labels_no_bl, "^1L[.]") :* labels_no_bl)
+    labels_no_bl = regexr(labels_no_bl, "_Dgn_L1$", "_Dgn")
+
+    // Save the labels, space-separated, in the local macro nice_labels for the Stata code that follows
+    st_local("nice_labels", invtokens(labels_no_bl'))
+end
+
+* Write the cleaned labels to a temporary text file, one per line, under the header "v1"
+capture file close labelout
+file open labelout using "$dir_results/temp_labels.txt", write replace
+file write labelout "v1" _n  // header for import
+foreach lbl in `nice_labels' {
+    file write labelout "`lbl'" _n
+}
+file close labelout
+
+* Read the labels back as a dataset. NOTE(review): clear replaces the data in memory and, unlike H1a/H1b, no preserve/restore surrounds this step - confirm the estimation data are not needed afterwards
+import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8)
+gen n = _n
+
+* Export to Excel (vertical layout in column A): label j goes to row j+1
+putexcel set "$dir_results/reg_health", sheet("H2b") modify
+summarize n, meanonly
+local N = r(max) + 1
+forvalue i = 2/`N' {
+    local j = `i' - 1
+    putexcel A`i' = v1[`j']
+}
+
+* Export to Excel (horizontal layout in row 1, starting at column C)
+local last = `N' - 1  // number of labels; looping to N would index past the last observation
+forvalues j = 1/`last' {
+    local nn = `j' + 2  // shift index: col C = 3
+    local col ""
+    while `nn' > 0 {
+        local rem = mod(`nn' - 1, 26)
+        local col = char(65 + `rem') + "`col'"
+        local nn = floor((`nn' - 1)/26)
+    }
+    putexcel `col'1 = v1[`j']
+}
+
+* Remove the temporary label file
+erase "$dir_results/temp_labels.txt"
+
+
+* Export model fit statistics	
+putexcel set "$dir_results/reg_health", sheet("Gof") modify
+
+putexcel A15 = "H2b -  Long-term sick/disabled or on disability benefits, left initial edu spell", bold		
+putexcel A17 = "Pseudo R-squared" 
+putexcel B17 = r2_p 
+putexcel A18 = "N"
+putexcel B18 = N_sample  
+putexcel E17 = "Chi^2"		
+putexcel F17 = chi2
+putexcel E18 = "Log likelihood"		
+putexcel F18 = ll		
+	
+* Clean up 		
+//drop in_sample p
+scalar drop _all
+matrix drop _all
+//frame drop temp_frame 	
+
 capture log close 
+
+cap erase "$dir_results/temp.dta"
+	
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do
index 52df228ae..0dd0739ca 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do
@@ -3,7 +3,11 @@
 * SECTION:			Home ownership 
 * OBJECT: 			Final Regresion Models - Weighted
 * AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* LAST UPDATE:		15 May 2025 DP  
+* COUNTRY: 			UK
+*
+* NOTES: 			Removed spousal education to include singles, combined it with hh composition instead, added lagged home ownership as a predictor 
+*                  
 ********************************************************************************
 clear all
 set more off
@@ -12,90 +16,270 @@ set type double
 //set maxvar 120000
 set maxvar 30000
 
+*******************************************************************
+cap log close 
+log using "${dir_log}/reg_home_ownership.log", replace
+*******************************************************************
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-* Directory which contains do files
-global dir_do "${dir_work}/do"
+do "$dir_do/variable_update"
 
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
 
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
+*sample selection 
+drop if dag < 16
 
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
 
+xtset idperson swv
 
-*******************************************************************
-cap log close 
-log using "${dir_log}/reg_home_ownership.log", replace
-*******************************************************************
 
-use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
+* Set Excel file 
 
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
-
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-
-label define gdr 1  "Male" 0 "Female"
-			
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-			
-label define yn	1 "Yes" 0 "No"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children"
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 5 Category" 
-label variable dhe "Self-rated Health"
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable dhhtp_c4 "Household Type: 4 Category"
-
-label value dgn gdr
-label value drgn1 rgna
-label value les_c3 lessp_c3 jbf 
-label value deh_c3 dehsp_c3 edd 
-label value dcpen dcpex dlrtrd yn
-label value dhhtp_c4 hht
+* Info sheet
 
-drop if dag < 16
-replace stm = stm - 2000
+putexcel set "$dir_results/reg_home_ownership", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters governing projection of home ownership"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
 
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "HO1a"
+putexcel B5 = "Probit regression estimates of the probability of being a home owner, aged 18+"
 
-*check if all covariates are available and recode missing values 
-recode dhh_owned dgn dag dagsq les_c3 deh_c3 dhe yptciihs_dv ydses_c5 drgn1 dhhtp_c4 lessp_c3 stm (-9=.)
+putexcel A10 = "Notes:", bold
+putexcel B10 = "Have combined dhhtp_c4 and lessp_c3 into a single variable with 8 categories, dhhtp_c8"
+putexcel B11 = "Added lagged home ownership, replaced dhe with dhe_pcs and dhe_mcs, added ethnicity (dot) and covid dummies (y2020 2021)"
 
-xtset idperson swv
+putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold		
+
+
+************************
+* HO1a: Home ownership *
+************************
+
+* Process HO1a: Probability of being a home owner 
+* Sample: Individuals aged 18+
+* DV: Home ownership dummy
+
+fre dhh_owned if dag >= 18
 
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 ///
+i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if ///
+dag>=18 [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(HO1a, dimlwt) side dec(4) 
 
-*************************************************
-*Process HO1: Probability of being a homeowner. *
-*************************************************
-*Sample: Individuals aged 16 and above.
+probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 ///
+i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if ///
+dag>=18 [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(HO1a, disclwt) side dec(4)
 
-probit dhh_owned dgn dag dagsq il.dhhtp_c4 il.les_c3 il.lessp_c3 i.deh_c3 il.dhe il.ydses_c5 l.yptciihs_dv ib8.drgn1 stm if dag>=16 [pweight=disclwt], vce(cluster idperson)
+probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 ///
+i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if ///
+dag>=18 [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(HO1a, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_HO1a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////	
+*/	
+	
+probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 ///
+i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if ///
+dag>=18 [pweight=dimxwt], vce(cluster idperson)
+
+
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/homeownership", sheet("Process HO1a") replace
+putexcel set "$dir_raw_results/home_ownership/homeownership", sheet("Process HO1a") replace
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/U1a.doc", replace ///
-title("Process HO1a: Probability of being a home owner - people aged 16+") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/home_ownership/HO1a.doc", replace ///
+title("Process HO1a: Probability of being a home owner - individuals aged 18+") ///
  ctitle(home owner) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/HO1a_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
 
 
+* Results 	
+* Note: Zero-valued coefficients are eliminated
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+*  Store variance-covariance matrix 
+
+preserve  // data in memory is replaced by the imported matrix below; restored at the end of this block
+
+putexcel set "$dir_raw_results/home_ownership/var_cov", sheet("var_cov") ///
+	replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/home_ownership/var_cov", sheet("var_cov") clear
+
+describe  // populates r(k), the number of imported columns
+local no_vars = `r(k)'	  // reused after this block as the number of entries of b to scan
+	
+forvalues i = 1/2 {  // two passes: the first prunes rows, the second (after xpose) prunes the former columns
+	egen row_sum = rowtotal(*)  // total across every variable currently in memory
+	drop if row_sum == 0   // discard rows whose total is exactly zero (presumably omitted/base-level terms - confirm)
+	drop row_sum
+	xpose, clear	  // transpose; variables are renamed v1, v2, ...
+}	
+	
+mkmat v*, matrix(var)	  // relies on the v* names produced by xpose
+putexcel set "$dir_results/reg_home_ownership", sheet("HO1a") modify 
+putexcel C2 = matrix(var)  // pruned variance-covariance block, anchored at cell C2
+		
+restore	  // bring back the estimation dataset
+
+
+* Store estimated coefficients 
+
+// Pass 1: tally the entries of b that are not exactly zero
+local n_kept = 0
+// (only coefficient values are copied; column names of b are not carried over)
+
+// Walk the coefficient row vector and bump the tally for each non-zero entry
+forvalues pos = 1/`no_vars' {
+    if b[1, `pos'] != 0 {
+        local ++n_kept
+    }
+}
+
+// Pass 2: allocate a row vector sized to the tally, pre-filled with missing
+matrix nonzero_b = J(1, `n_kept', .)
+
+// Copy the non-zero entries across, preserving their original order
+local slot = 1
+forvalues pos = 1/`no_vars' {
+    if b[1, `pos'] != 0 {
+        matrix nonzero_b[1, `slot'] = b[1, `pos']
+        local ++slot
+    }
+}
+
+putexcel set "$dir_results/reg_home_ownership", sheet("HO1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 	
+	
+
+* Labelling 
+ 
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Dhhtp_c8_2_L1"
+putexcel A6 = "Dhhtp_c8_3_L1"
+putexcel A7 = "Dhhtp_c8_4_L1"
+putexcel A8 = "Dhhtp_c8_5_L1"
+putexcel A9 = "Dhhtp_c8_6_L1"
+putexcel A10 = "Dhhtp_c8_7_L1"
+putexcel A11 = "Dhhtp_c8_8_L1"
+putexcel A12 = "Les_c3_Student_L1"
+putexcel A13 = "Les_c3_NotEmployed_L1"
+putexcel A14 = "Deh_c3_Medium"
+putexcel A15 = "Deh_c3_Low"
+putexcel A16 = "Dhe_mcs"
+putexcel A17 = "Dhe_pcs"
+putexcel A18 = "Ydses_c5_Q2_L1"
+putexcel A19 = "Ydses_c5_Q3_L1"
+putexcel A20 = "Ydses_c5_Q4_L1"
+putexcel A21 = "Ydses_c5_Q5_L1"
+putexcel A22 = "Yptciihs_dv_L1"
+putexcel A23 = "Dhh_owned_L1"
+putexcel A24 = "UKC"
+putexcel A25 = "UKD"
+putexcel A26 = "UKE"
+putexcel A27 = "UKF"
+putexcel A28 = "UKG"
+putexcel A29 = "UKH"
+putexcel A30 = "UKJ"
+putexcel A31 = "UKK"
+putexcel A32 = "UKL"
+putexcel A33 = "UKM"
+putexcel A34 = "UKN"
+putexcel A35 = "Year_transformed"
+putexcel A36 = "Y2020"
+putexcel A37 = "Y2021"
+putexcel A38 = "Ethn_Asian"
+putexcel A39 = "Ethn_Black"
+putexcel A40 = "Ethn_Other"
+putexcel A41 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Dhhtp_c8_2_L1"
+putexcel G1 = "Dhhtp_c8_3_L1"
+putexcel H1 = "Dhhtp_c8_4_L1"
+putexcel I1 = "Dhhtp_c8_5_L1"
+putexcel J1 = "Dhhtp_c8_6_L1"
+putexcel K1 = "Dhhtp_c8_7_L1"
+putexcel L1 = "Dhhtp_c8_8_L1"
+putexcel M1 = "Les_c3_Student_L1"
+putexcel N1 = "Les_c3_NotEmployed_L1"
+putexcel O1 = "Deh_c3_Medium"
+putexcel P1 = "Deh_c3_Low"
+putexcel Q1 = "Dhe_mcs"
+putexcel R1 = "Dhe_pcs"
+putexcel S1 = "Ydses_c5_Q2_L1"
+putexcel T1 = "Ydses_c5_Q3_L1"
+putexcel U1 = "Ydses_c5_Q4_L1"
+putexcel V1 = "Ydses_c5_Q5_L1"
+putexcel W1 = "Yptciihs_dv_L1"
+putexcel X1 = "Dhh_owned_L1"
+putexcel Y1 = "UKC"
+putexcel Z1 = "UKD"
+putexcel AA1 = "UKE"
+putexcel AB1 = "UKF"
+putexcel AC1 = "UKG"
+putexcel AD1 = "UKH"
+putexcel AE1 = "UKJ"
+putexcel AF1 = "UKK"
+putexcel AG1 = "UKL"
+putexcel AH1 = "UKM"
+putexcel AI1 = "UKN"
+putexcel AJ1 = "Year_transformed"
+putexcel AK1 = "Y2020"
+putexcel AL1 = "Y2021"
+putexcel AM1 = "Ethn_Asian"
+putexcel AN1 = "Ethn_Black"
+putexcel AO1 = "Ethn_Other"
+putexcel AP1 = "Constant"
+
+
+* Goodness of fit
+
+putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify
+
+putexcel A3 = "HO1a - Home ownership", bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
 capture log close 
+
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_income.do b/input/InitialPopulations/compile/RegressionEstimates/reg_income.do
index 138e19009..49bf7ce28 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_income.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_income.do
@@ -1,9 +1,23 @@
 ********************************************************************************
 * PROJECT:  		ESPON
 * SECTION:			Non-employment/non-benefit income
-* OBJECT: 			Final Regresion Models - Weighted
-* AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* OBJECT: 			Final Regresion Models 
+* AUTHORS:			Patryk Bronka, Daria Popova, Justin van de Ven
+* LAST UPDATE:		3 July 2025 DP  
+* COUNTRY: 			UK
+
+* NOTES: 			 Models for split income variable
+*                    The goal is to split the current non-labour non-benefit income variable into 3 components  
+*                    (capital returns, occupational pension, public pension) and estimate each of them separately, 
+*                    using (if possible) current set of controls. We have decided to abstain from estimating transfers at the moment. 
+*                       
+*                       The income do file must be run after
+* 						the wage estimates are obtained because it uses 
+* 						predicted wages. 
+/*******************************************************************************
+
+
+*******************************************************************************/
 ********************************************************************************
 clear all
 set more off
@@ -17,7 +31,8 @@ set maxvar 30000
 *	DEFINE DIRECTORIES
 *******************************************************************************/
 * Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+//global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
+global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates"
 
 * Directory which contains do files
 global dir_do "${dir_work}/do"
@@ -29,16 +44,15 @@ global dir_data "${dir_work}/data"
 global dir_log "${dir_work}/log"
 
 * Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
-
+//global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
+global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data"
 
 *******************************************************************
 cap log close 
 log using "${dir_log}/reg_income.log", replace
 *******************************************************************
 
-
-import excel "$dir_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real growth index
+import excel "$dir_external_data/time_series_factor.xlsx", sheet("UK_gdp") firstrow clear // Import real growth index
 rename Year stm
 rename Value growth
 gen base_val = growth if stm == 2015
@@ -47,370 +61,1103 @@ replace base_val = r(mean)
 replace growth= growth/base_val
 drop base_val
 replace stm = stm - 2000
-save "$dir_data\growth_rates", replace
-
-use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
-
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
-
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-			
-label define gdr 1  "Male" 0 "Female"
-			
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-
-label define yn	1 "Yes" 0 "No"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children" 
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 3 Category" 
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable dhhtp_c4 "Household Type: 4 Category"
-label variable dnc "Number of Children in Household"
-label variable dnc02 "Number of Children aged 0-2 in Household"
-label variable dhe "Self-rated Health"
-label variable ydses_c5 "Annual Household Income Quintile" 
-label variable dlltsd "Long-term Sick or Disabled"
-label variable dcpen "Entered a new Partnership"
-label variable dcpex "Partnership dissolution"
-label variable lesdf_c4 "Differntial Employment Status"
-label variable ypnbihs_dv "Personal Non-benefit Gross Income"
-
-gen  ypnbihs_dv_sq =ypnbihs_dv^2 
- 
-label variable ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared"
-label variable ynbcpdf_dv "Differential Personal Non-Benefit Gross Income"
-
-label value dgn gdr
-label value drgn1 rgna
-label value les_c3 jbf 
-label value deh_c3 edd 
-label value dcpen dcpex yn
-label value lesdf_c4 dces
-label value ded dlltsd yn
-label value dhhtp_c4 hht
+save "$dir_external_data/growth_rates", replace
+
+use "$dir_ukhls_data/ukhls_pooled_all_obs_10.dta", clear //note this is a pooled dataset after Heckman has been estimated  
 
-drop if dag < 16
-//replace stm = stm - 2000
 sort stm
-merge m:1 stm using "$dir_data/growth_rates", keep(3) nogen keepusing(growth)
+merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen keepusing(growth)
 
+do "$dir_do/variable_update"
 
-/**********************************************************************
-CLEAN UP VARIABLES FOR REGRESSIONS	
-***********************************************************************/
-recode  dgn dag dagsq dhe drgn1 stm scedsmpl deh_c3 les_c3 dhhtp_c4 dhe (-9=.)
-sum yplgrs_dv ypncp ypnoab pred_hourly_wage
+*sample selection 
+drop if dag < 16
 
 xtset idperson swv
-/*
-*****************************************************************
-*Process I1a: Non-employment income - In continuous education   *
-*****************************************************************
-regress yptciihs_dv i.dgn dag dagsq l.dhe l.yptciihs_dv ib8.drgn1 stm if scedsmpl==1 [pweight=disclwt], vce(robust)
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set "$dir_data/Income_mdls", sheet("Income - In education") replace
-putexcel A1 = matrix(results), names nformat(number_d2) 
-*predict fittedice
-*histogram fittedice
-*histogram yptciihs_dv
-
-*Getting Variance Covariance Matrix 
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/income_vcm", sheet("Process I1a - In education") replace
-putexcel A1 = matrix(i1a), names
 
-*******************************************************************
-*Process I1b: Non-employment income - Not in continuous education *
-*******************************************************************
-regress yptciihs_dv i.dgn dag dagsq ib1.deh_c3 i.dlrtrd li.les_c3 lib1.dhhtp_c4 l.dhe l.yplgrs_dv l.yptciihs_dv ///
-l2.yplgrs_dv l2.yptciihs_dv l3.yplgrs_dv l3.yptciihs_dv ib8.drgn1 stm if scedsmpl==0 [pweight=disclwt], vce(robust)
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set "$dir_data/Income_mdls", sheet("Income - Not in education") modify
-putexcel A1 = matrix(results), names nformat(number_d2) 
-*predict fittednice
-*histogram fittednice
-*histogram yptciihs_dv
-
-*Getting Variance Covariance Matrix 
-matrix i1b=get(VCE)
-matrix list i1b
-putexcel set "$dir_data/income_vcm", sheet("Process I1b - Not in education") modify
-putexcel A1 = matrix(i1b), names
-*/
-
-
-/*******************************************************************************
-
-New models for split income variable
-The goal is to split the current non-labour non-benefit income variable into 3 components  
-(capital returns, occupational pension, public pension) and estimate each of them separately, 
-using (if possible) current set of controls. We have decided to abstain from estimating transfers at the moment. 
-
-*******************************************************************************/
-bys swv idhh: gen nwa = _N
-*Replace l.dhe with dhe if aged 16
-gsort +idperson -stm
-bys idperson: carryforward dhe if dag <= 16, replace 
-
-//For those who are 16, L1 of the variables below is missing as they were 15 at the time. Use current value to keep them in the sample. 
-sort idperson swv
-bys idperson: gen dhe_L1 = l.dhe
-replace dhe_L1 = dhe if missing(dhe_L1) //For those who have L1.dhe missing, use current dhe
-
-bys idperson: gen yplgrs_L1 = l.yplgrs_dv
-replace yplgrs_L1 = yplgrs_dv if missing(yplgrs_L1)
-
-bys idperson: gen ypncp_L1 = l.ypncp
-replace ypncp_L1 = ypncp if missing(ypncp_L1)
-
-bys idperson: gen yplgrs_L2 = l2.yplgrs_dv
-replace yplgrs_L2 = yplgrs_dv if missing(yplgrs_L2)
-
-bys idperson: gen ypncp_L2 = l2.ypncp
-replace ypncp_L2 = ypncp if missing(ypncp_L2)
-
-bys idperson: gen dhhtp_c4_L1 = l.dhhtp_c4
-replace dhhtp_c4_L1 = dhhtp_c4 if missing(dhhtp_c4_L1)
-
-bys idperson: gen les_c3_L1 = l.les_c3
-replace les_c3_L1 = les_c3 if missing(les_c3_L1)
 
+* Set Excel file 
+
+* Info sheet
+putexcel set "$dir_results/reg_income", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "This file contains regression estiamtes used by processes I3 (capital income), I4 (private pension, retired last year), I5 (private pension income, not retired last year) "
+putexcel A2 = "Authors:	Patryk Bronka, Justin Van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "Process I3a selection"
+putexcel B5 = "Logit regression estimates of the probability of receiving capital income - aged 16+ in initial education spell"
+putexcel A6 = "Process I3b selection"
+putexcel B6 = "Logit regression estimates of the probability of receiving capital income - aged 16+ not in initial education spell"
+putexcel A7 = "Process I3a amount"
+putexcel B7 = "OLS regression estimates (log) capital income amount - aged 16+ in initial education spell and receive capital income"
+putexcel A8 = "Process I3b amount"
+putexcel B8 = "OLS regression estimates (log) capital income amount - not in initial education spell and receive capital income"
+putexcel A9 = "Process I4b amount"
+putexcel B9 = "OLS regression estimates (log) private pension income - aged 50+ and were retired last year, receive private pension income"
+putexcel A10 = "Process I5a selection"
+putexcel B10 = "Logit regression estimates of the probability of receiving private pension income - aged 50+ and not a student or retired last year"
+putexcel A11 = "Process I5a amount"
+putexcel B11 = "OLS regression estimates (log) private pension income - aged 50+ and not a student or retired last year"
+
+
+putexcel A15 = "Notes:", bold
+putexcel B15 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot) and Covid dummies (y2020 y2021)"
+putexcel B16 = "All processes: reverted to using stm instead of GDP growth"
+putexcel B17 = "All processes for amounts: moved to log transformation"
 
 /**********************************************************************
-SELECTION MODELS FOR CAPITAL INCOME 
+CAPITAL INCOME 
 ***********************************************************************/
 
 *****************************************************************
-*Process I3a selection: Probability of receiving capital income. 
+*I3a selection: Probability of receiving capital income, in initial edu spell 
 *****************************************************************
-*Sample: Individuals aged 16 - 29 who are in continuous education.
-gen receives_ypncp = (ypncp > 0 & !missing(ypncp))
-logit receives_ypncp i.dgn dag dagsq l.dhe l.yplgrs_dv l.ypncp ib8.drgn1 stm if scedsmpl==1 [pweight=dimxwt], vce(cluster idperson) base
+* Sample: All individuals 16+ that are in initial edu spell
+* DV: Receiving capital income dummy
+* Note: Capital income and employment income variables in IHS version 	
+
+logit receives_ypncp i.dgn dag dagsq /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 yplgrs_dv_L1 ypncp_L1 ib8.drgn1 stm y2020 y2021 i.dot ///
+ if ded == 1 & dag >= 16 [pweight=dimxwt], ///
+ vce(cluster idperson) base
 
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I3a_selection E") replace
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I3a_selection E") replace
 putexcel A1 = matrix(results), names nformat(number_d2)
-
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3a_selection VCE") replace
-putexcel A1 = matrix(i1a), names
-
-outreg2 stats(coef se pval) using "$dir_data/I3a_sel.doc", replace ///
-title("Process I3a selection: Probability of receiving capital income. Sample: Individuals aged 16 - 29 who are in continuous education.") ///
+matrix i3a=get(VCE)
+matrix list i3a
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3a_selection VCE") replace
+putexcel A1 = matrix(i3a), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I3a_sel.doc", replace ///
+title("Process I3a selection: Probability of receiving capital income. Sample: Individuals aged 16+ who are in initial education spell.") ///
 ctitle(Probability of capital income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
- 
-********************************************************************
-*Process I3b selection: Probability of receiving capital income.
-********************************************************************
-*Sample: Individuals aged 16+ who are not in continuous education.
-
-logit receives_ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c3 lib1.dhhtp_c4 l.dhe l.yplgrs_dv l.ypncp l2.yplgrs_dv ///
-l2.ypncp ib8.drgn1 stm if scedsmpl==0 [pweight=dimxwt], vce(cluster idperson) base
-
+cap drop in_sample
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/I3a_selection_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+* Results
+* Note: Zero-valued coefficients are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I3a_selection") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I3a_selection") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+	
+* Labelling 
+// Update the label list below whenever a variable is added to the model. Order matters. 
+local var_list Dgn Dag Dag_sq Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+	Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant 
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+		
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A3 = ///
+	"I3a selection - Receiving capital income in initial education spell ", ///
+	bold		
+	
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll			
+
+
+*********************************************************************
+* I3b selection: Probability of receiving capital income, not in initial edu spell *
+*********************************************************************
+* Sample: All individuals 16+, not in initial edu spell
+* DV: Receiving capital income dummy
+* Note: Capital income and employment income variables in IHS version 	
+
+logit receives_ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 ///
+yplgrs_dv_L1 ypncp_L1 yplgrs_dv_L2 ypncp_L2 ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot ///
+ if ded == 0 [pweight=dimxwt], ///
+ vce(cluster idperson) base
+
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I3b_selection E") modify
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I3b_selection E") modify  // modify: keep the I3a sheet already written to this workbook
 putexcel A1 = matrix(results), names nformat(number_d2)
-
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3b_selection VCE") modify
-putexcel A1 = matrix(i1a), names
-
-outreg2 stats(coef se pval) using "$dir_data/I3b_sel.doc", replace ///
-title("Process I3b selection: Probability of receiving capital income. Sample: Individuals aged 16+ who are not in continuous education.") ///
+matrix i3b=get(VCE)
+matrix list i3b
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3b_selection VCE") modify  // modify: keep the I3a VCE sheet already written to this workbook
+putexcel A1 = matrix(i3b), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I3b_sel.doc", replace ///
+title("Process I3b selection: Probability of receiving capital income. Sample: Individuals aged who are not in initial education spell.") ///
 ctitle(Probability of capital income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
-/**********************************************************************/
-********************************************
-*Process I3a: Amount of capital income. 
-********************************************
-*Sample: Individuals aged 16 - 29 who are in continuous education and receive capital income.
-*Using same controls as Cara - use of lags means those observed for the first time are not taken into account
-
-regress ypncp i.dgn dag dagsq l.dhe l.yplgrs_dv l.ypncp ib8.drgn1 stm if scedsmpl==1 & receives_ypncp == 1 [pweight=dimxwt], ///
-vce(cluster idperson) base
+//cap drop in_sample
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/I3b_selection_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+* Results
+* Note: Zero-valued coefficients are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I3b_selection") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I3b_selection") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+	
+* Labelling 
+// Update var_list whenever a regressor is added to or removed from the model: labels are assigned by position, so order matters.
+
+local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low Les_c4_Student_L1 ///
+	Les_c4_NotEmployed_L1 Les_c4_Retired_L1 Dhhtp_c4_CoupleChildren_L1 ///
+	Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 ///
+	Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 Yplgrs_dv_L2 Ypncp_L2 ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+	Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant
+	
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+		
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A9 = ///
+	"I3b selection - Receiving capital income left initial education spell ", ///
+	bold		
+	
+putexcel A11 = "Pseudo R-squared" 
+putexcel B11 = r2_p 
+putexcel A12 = "N"
+putexcel B12 = N 
+putexcel E11 = "Chi^2"		
+putexcel F11 = chi2
+putexcel E12 = "Log likelihood"		
+putexcel F12 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll		
+
+
+*******************************************************
+* I3a: Amount of capital income, in initial edu spell * 
+*******************************************************
+* Sample: All individuals 16+ that received capital income, in initial education spell
+* DV: IHS of capital income 
+
+regress ln_ypncp i.dgn dag dagsq /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 yplgrs_dv_L1 ypncp_L1 ///
+ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot if dag >= 16 & receives_ypncp == 1 & ded == 1 ///
+	[pweight = dimxwt], vce(cluster idperson) 
+
+* Raw results: rows 1-6 of r(table), one row per regressor. Only this sheet is replaced (sheet(, replace) + modify) so sheets written by other processes stay in the workbook.
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I3a CapIn E") modify
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I3a_amount E", replace) modify
 putexcel A1 = matrix(results), names nformat(number_d2)
-
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3a CapIn E VCE") modify
-putexcel A1 = matrix(i1a), names
-
-outreg2 stats(coef se pval) using "$dir_data/I3a.doc", replace ///
-title("Process I3a: Amount of capital income. Sample: Individuals aged 16 - 29 who are in continuous education and receive capital income.") ///
- ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))
-
-*******************************************
-*Process I3b: Amount of capital income. 
-*******************************************
-*Sample: Individuals aged 16+ who are not in continuous education and receive capital income.
-*Using same controls as Cara
-regress ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c3 lib1.dhhtp_c4 l.dhe l.yplgrs_dv l.ypncp l2.yplgrs_dv l2.ypncp ib8.drgn1 stm ///
- if scedsmpl==0 & receives_ypncp == 1 [pweight=dimxwt], vce(cluster idperson) base
+matrix i3a=get(VCE) // variance-covariance matrix of the regression just estimated
+matrix list i3a
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3a_amount VCE", replace) modify // replace this sheet only; keep the other processes' sheets
+putexcel A1 = matrix(i3a), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I3a.doc", replace ///
+title("Process I3a: Amount of capital income. Sample: Individuals aged 16+ who are in initial education spell and receive capital income.") ///
+ ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))	
+		
+	
+* Save sample inclusion indicator, fitted values and RMSE
+gen in_sample = e(sample)	
+predict p 
+gen sigma = e(rmse)
+
+save "$dir_validation_data/I3a_level_sample", replace
+
+scalar r2 = e(r2) 
+scalar N = e(N)		
+scalar rmse= e(rmse)
+
+* Results 
+* Note: Zero values (all-zero rows/columns of V, zero entries of b) are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I3a_amount") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I3a_amount") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 		
+ 	
+* Labelling 
+// Update var_list whenever a regressor is added to or removed from the model: labels are assigned by position, so order matters.
+local var_list Dgn Dag Dag_sq Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 ///
+UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+		
+* save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A6 = ("I3a") B6 = rmse 
+
+
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A15 = ///
+	"I3a level - Receiving capital income in initial education spell ", ///
+	bold		
+	
+putexcel A17 = "R-squared" 
+putexcel B17 = r2 
+putexcel A18 = "N"
+putexcel B18 = N 
+
+drop in_sample p sigma 
+scalar drop r2 N 
+
+
+***********************************************************
+* I3b: Amount of capital income, not in initial edu spell * 
+*********************************************************** 
+* Sample: Individuals aged 16+ who are not in their initial education spell and 
+* 	receive capital income.
+
+regress ln_ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 ///
+	yplgrs_dv_L1 ypncp_L1 yplgrs_dv_L2 ypncp_L2 ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot ///
+	if ded == 0 & receives_ypncp == 1 [pweight = dimxwt], ///
+	vce(cluster idperson)
+	
+* Raw results: rows 1-6 of r(table), one row per regressor. Only this process's sheets are replaced (sheet(, replace) + modify) so sheets written by other processes stay in the workbooks.
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I3b CapIn NiE") modify
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I3b_amount E", replace) modify
 putexcel A1 = matrix(results), names nformat(number_d2)
+matrix i3b=get(VCE) // variance-covariance matrix of the regression just estimated
+matrix list i3b
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3b_amount VCE", replace) modify
+putexcel A1 = matrix(i3b), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I3b.doc", replace ///
+title("Process I3b: Amount of capital income. Sample: Individuals aged 16+ who are not in initial education spell and receive capital income.") ///
+ ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))	
+		
+	
+* Save sample inclusion indicator, fitted values and RMSE
+gen in_sample = e(sample)	
+predict p 
+gen sigma = e(rmse)
+
+save "$dir_validation_data/I3b_level_sample", replace
+
+scalar r2 = e(r2) 
+scalar N = e(N)	
+scalar rmse= e(rmse)
+
+* Results
+* Note: Zero values (all-zero rows/columns of V, zero entries of b) are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I3b_amount") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I3b_amount") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 		
+	
+* Labelling 
+// Update var_list whenever a regressor is added to or removed from the model: labels are assigned by position, so order matters.
+local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low Les_c4_Student_L1 ///
+	Les_c4_NotEmployed_L1 Les_c4_Retired_L1  Dhhtp_c4_CoupleChildren_L1 ///
+	Dhhtp_c4_SingleNoChildren_L1  Dhhtp_c4_SingleChildren_L1 ///
+	Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 Yplgrs_dv_L2 Ypncp_L2 ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+    Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant
+		
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+	
+* Save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A7 = ("I3b") B7 = rmse 
+
+
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A21 = ///
+	"I3b level - Receiving capital income left initial education spell ", ///
+	bold		
+	
+putexcel A23 = "R-squared" 
+putexcel B23 = r2 
+putexcel A24 = "N"
+putexcel B24 = N 
+
+drop in_sample p sigma 
+scalar drop r2 N 
 
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3b CapIn NiE VCE") modify
-putexcel A1 = matrix(i1a), names
-
-outreg2 stats(coef se pval) using "$dir_data/I3b.doc", replace ///
-title("Process I3b: Amount of capital income. Sample: Individuals aged 16+ who are not in continuous education and receive capital income.") ///
-ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))
-
-replace les_c3 = 4 if dlrtrd == 1
-
-label define jbf 4 "Retired", add 
 
 /**********************************************************************
 PRIVATE PENSION INCOME
 ***********************************************************************/
+
 ***************************************************
-*Process I4b: Amount of pension income. 
+*I4b: Amount of pension income. 
 ***************************************************
 *Sample: Retired individuals who were retired in the previous year.
-gen state_pension_age = (dag >= 68)
-gen receives_ypnoab = (ypnoab_lvl > 0 & !missing(ypnoab_lvl))
 
-regress ypnoab dag dagsq ib1.deh_c3 lib1.dhhtp_c4 l.dhe l.ypnoab l2.ypnoab ib8.drgn1 c.growth stm ///
-if dag >= 50 & les_c3 == 4 & l.les_c3 == 4 [pweight=dimxwt], vce(cluster idperson) base
+regress ln_ypnoab i.dgn dag dagsq ib1.deh_c3 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 ///
+ypnoab_L1 ypnoab_L2 ib8.drgn1 stm  /*c.growth*/ y2020 y2021 i.dot ///
+if dag >= 50 & receives_ypnoab & dlrtrd==1 & l.dlrtrd==1 [pweight=dimxwt], ///
+vce(cluster idperson) base
+
 
+* Raw results: rows 1-6 of r(table), one row per regressor. Only this sheet is replaced (sheet(, replace) + modify) so sheets written by other processes stay in the workbook.
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I4b Pension Next") modify
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I4b_amount E", replace) modify
 putexcel A1 = matrix(results), names nformat(number_d2)
-
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I4b Pension Next VCE") modify
-putexcel A1 = matrix(i1a), names
-
-outreg2 stats(coef se pval) using "$dir_data/14b.doc", ///
-replace title("Process I4b: Amount of pension income. Sample: Retired individuals who were retired in the previous year.") ///
-ctitle(Retired) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))
+matrix i4b=get(VCE) // variance-covariance matrix of the regression just estimated
+matrix list i4b
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I4b_amount VCE", replace) modify // replace this sheet only; keep the other processes' sheets
+putexcel A1 = matrix(i4b), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I4b.doc", replace ///
+title("Process I4b: Amount of private pension income. Sample: Individuals aged 50+ who were retired in the previous year and receive private pension income.") ///
+ ctitle(Amount of private pension income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))	
+				
+	
+* Save sample inclusion indicator, fitted values and RMSE
+gen in_sample = e(sample)	
+predict p 
+gen sigma = e(rmse)
+
+save "$dir_validation_data/I4b_level_sample", replace
+
+scalar r2 = e(r2) 
+scalar N = e(N)	
+scalar rmse= e(rmse)
+
+* Results
+* Note: Zero values (all-zero rows/columns of V, zero entries of b) are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I4b_amount") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I4b_amount") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 		
+	
+* Labelling 
+// Update var_list whenever a regressor is added to or removed from the model: labels are assigned by position, so order matters.
+local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low ///
+	Dhhtp_c4_CoupleChildren_L1 	Dhhtp_c4_SingleNoChildren_L1  Dhhtp_c4_SingleChildren_L1 ///
+	Dhe_pcs_L1 Dhe_mcs_L1 Ypnoab_L1 Ypnoab_L2 ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+    Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant
+		
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+	
+* Save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A8 = ("I4b") B8 = rmse  
+
+
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A26 = ///
+	"I4b level - Receiving private pension income: was retired last year", ///
+	bold		
+	
+putexcel A27 = "R-squared" 
+putexcel B27 = r2 
+putexcel A28 = "N"
+putexcel B28 = N 
+
+drop in_sample p sigma 
+scalar drop r2 N 
 
 
-
-/**********************************************************************
-PRIVATE PENSION INCOME VERSION 2: 
-	-selection equation for recipiency of private pension income
-	-followed by level of private pension income using linear model
-***********************************************************************/
 **************************************************************************
-*Process I5a: Probability of receiving private pension income. 
+*I5a: Probability of receiving private pension income. 
 **************************************************************************
 *Sample: Retired individuals who were not retired in the previous year.
+* DV: Receiving private pension income dummy
 /*
 Estimated on a sample of individuals retired at time t, who were not retired at t-1.
 I.e. this is probability of receiving private pension income upon retirement. 
 */
 
-logit receives_ypnoab i.dgn i.state_pension_age ib1.deh_c3 lib4.les_c3 lib1.dhhtp_c4 l.dhe l.pred_hourly_wage ib8.drgn1 c.growth stm ///
-if scedsmpl==0 & dag >= 50 & dlrtrd == 1 & l.les_c3 != 2 & l.les_c3 != 4 [pweight=dimxwt], vce(cluster idperson) base
-
+logit receives_ypnoab i.dgn i.state_pension_age ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 ///
+l.pred_hourly_wage ib8.drgn1 stm /*c.growth*/  y2020 y2021 i.dot ///
+if dag >= 50 & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2 [pweight=dimxwt], ///
+vce(cluster idperson) base
+ 
+* Raw results: rows 1-6 of r(table), one row per regressor. Only this process's sheets are replaced (sheet(, replace) + modify) so sheets written by other processes stay in the workbooks.
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I5a Select") modify
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I5a_selection E", replace) modify
 putexcel A1 = matrix(results), names nformat(number_d2)
+matrix i5a=get(VCE) // variance-covariance matrix of the logit just estimated
+matrix list i5a
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I5a_selection VCE", replace) modify
+putexcel A1 = matrix(i5a), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I5a_sel.doc", replace ///
+title("Process I5a selection: Probability of receiving private pension income. Sample: Individuals aged 50+ who were not retired last year.") ///
+ctitle(Probability receiving private pension income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+
+//cap drop in_sample
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/I5a_selection_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+* Results
+* Note: Zero values (all-zero rows/columns of V, zero entries of b) are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I5a_selection") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I5a_selection") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+	
+* Labelling 
+// Update var_list whenever a regressor is added to or removed from the model: labels are assigned by position, so order matters.
+
+local var_list Dgn StatePensionAge Deh_c3_Medium Deh_c3_Low ///
+	Les_c4_NotEmployed_L1 ///
+	Dhhtp_c4_CoupleChildren_L1 	Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 ///
+	Dhe_pcs_L1 Dhe_mcs_L1 Hourly_wage_L1  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+	Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant
+
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+		
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A30 = /// section heading for the I5a selection fit statistics on the Gof sheet
+	"I5a selection - Receiving private pension income: was not retired last year", ///
+	bold		
+	
+putexcel A32 = "Pseudo R-squared" 
+putexcel B32 = r2_p 
+putexcel A33 = "N"
+putexcel B33 = N 
+putexcel E32 = "Chi^2"		
+putexcel F32 = chi2
+putexcel E33 = "Log likelihood"		
+putexcel F33 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll		
 
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I5a Select") modify
-putexcel A1 = matrix(i1a), names
 
-outreg2 stats(coef se pval) using "$dir_data/I5a.doc", replace ///
-title("Process I5a: Probability of receiving private pension income. Sample: Retired individuals who were not retired in the previous year.") ///
-ctitle(Probability of private pension income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
 
 ****************************************************
-*Process I5b: Amount of private pension income. 
+*I5a: Amount of private pension income. 
 ****************************************************
 *Sample: Retired individuals who were not retired in the previous year and receive private pension income.
-regress ypnoab_lvl i.dgn i.state_pension_age ib1.deh_c3 lib4.les_c3 lib1.dhhtp_c4 l.dhe l.pred_hourly_wage ib8.drgn1 c.growth stm ///
-if scedsmpl==0 & dag >= 50 & dlrtrd == 1 & l.les_c3 != 2 & l.les_c3 != 4 & receives_ypnoab [pweight=dimxwt], vce(cluster idperson) base
-
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I5b Amount") modify
-putexcel A1 = matrix(results), names nformat(number_d2)
-
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I5b Amount") modify
-putexcel A1 = matrix(i1a), names
 
-outreg2 stats(coef se pval) using "$dir_data/I5b.doc", replace ///
-title("Process I5b: Amount of private pension income. Sample: Retired individuals who were not retired in the previous year and receive private pension income.") ///
-ctitle(Amount of private pension income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))
-
-capture log close 
-
-/*
-********************
-*I6a: selection
-********************
-/*
-
-Processes I6a and I6b are used to estimate private pension income among those continue retirement (retired at t and at t-1), 
-*and have not received private pension income in the previous year
-
-Estimated on a sample of individuals retired at time t
-I.e. this is probability of receiving private pension in retirement, if not received private pension income in the initial population data 
-*/
-
-logit receives_ypnoab i.dgn i.state_pension_age ib1.deh_c3 lib4.les_c3 lib1.dhhtp_c4 cl.ypncp l.dhe ib8.drgn1 c.growth stm ///
-if dag >= 50 & les_c3 == 4 & l.les_c3 == 4 & l.receives_ypnoab == 0  [pweight=dimxwt], vce(cluster idperson) base
-
-matrix results = r(table)
-matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I6a Select") modify
-putexcel A1 = matrix(results), names nformat(number_d2)
-
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I6a Select") modify
-putexcel A1 = matrix(i1a), names
-
-***********************************************************************************
-*I6b: amount of private pension income for those receiving private pension income
-***********************************************************************************
-
-regress ypnoab_lvl i.dgn i.state_pension_age ib1.deh_c3 lib1.dhhtp_c4 l.dhe ib8.drgn1 cl.ypncp c.growth stm ///
-if dag >= 50 & les_c3 == 4 & l.les_c3 == 4 & l.receives_ypnoab == 0 & receives_ypnoab == 1 [pweight=dimxwt], vce(cluster idperson) base
+regress ln_ypnoab i.dgn dag dagsq /*i.state_pension_age*/ ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 ///
+l.pred_hourly_wage ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot ///
+if  dag >= 50 & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2 & receives_ypnoab [pweight=dimxwt], ///
+vce(cluster idperson) base
 
+* Raw results: rows 1-6 of r(table), one row per regressor. Only this process's sheets are replaced (sheet(, replace) + modify) so sheets written by other processes stay in the workbooks.
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/uk_income_split", sheet("Process I6b Amount") modify
+putexcel set "$dir_raw_results/income/income_split", sheet("Process I5a_amount E", replace) modify
 putexcel A1 = matrix(results), names nformat(number_d2)
+matrix i5a=get(VCE) // variance-covariance matrix of the regression just estimated
+matrix list i5a
+putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I5a_amount VCE", replace) modify
+putexcel A1 = matrix(i5a), names
+outreg2 stats(coef se pval) using "$dir_raw_results/income/I5a.doc", replace ///
+title("Process I5a: Amount of private pension income. Sample: Individuals aged 50+ who were not retired in the previous year and receive private pension income.") ///
+ ctitle(Amount of private pension income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse))	
+				
+	
+* Save sample inclusion indicator, fitted values and RMSE
+gen in_sample = e(sample)	
+predict p 
+gen sigma = e(rmse)
+
+save "$dir_validation_data/I5a_level_sample", replace
+
+scalar r2 = e(r2) 
+scalar N = e(N)	
+scalar rmse= e(rmse)
+
+* Results
+* Note: Zero values (all-zero rows/columns of V, zero entries of b) are eliminated
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_income", sheet("I5a_amount") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients 
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_income", sheet("I5a_amount") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 		
+	
+* Labelling 
+// Update var_list whenever a regressor is added to or removed from the model: labels are assigned by position, so order matters.
+local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low ///
+	Les_c4_NotEmployed_L1 Dhhtp_c4_CoupleChildren_L1 Dhhtp_c4_SingleNoChildren_L1  Dhhtp_c4_SingleChildren_L1 ///
+	Dhe_pcs_L1 Dhe_mcs_L1 Hourly_wage_L1 ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN ///
+    Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant
+		
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+	
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+	
+* Save RMSE
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A9 = ("I5a") B9 = rmse  
+
+
+* Goodness of fit
+putexcel set "$dir_results/reg_income", sheet("Gof") modify
+
+putexcel A35 = ///
+	"I5a level - Receiving private pension income: was not retired last year", ///
+	bold		
+	
+putexcel A37 = "R-squared" 
+putexcel B37 = r2 
+putexcel A38 = "N"
+putexcel B38 = N 
+
+drop in_sample p sigma 
+scalar drop r2 N 
+
+
+//end 
 
-matrix i1a=get(VCE)
-matrix list i1a
-putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I6b Amount") modify
-putexcel A1 = matrix(i1a), names
-
-*/
-
+capture log close 
 
+graph drop _all 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do b/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do
index 6bbabdde1..3a100c538 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do
@@ -1,10 +1,14 @@
 ********************************************************************************
-* PROJECT:  		INAPP
+* PROJECT:  		ESPON
 * SECTION:			Leaving Parental Home
-* OBJECT: 			Final Probit and Linear Regression Models - Weighted
+* OBJECT: 			Final Probit Regression Model 
 * AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
-********************************************************************************
+* LAST UPDATE:		1 July 2025 DP  
+* COUNTRY: 			UK  
+* 
+* NOTES: 			
+**********************************************************************************
+
 clear all
 set more off
 set mem 200m
@@ -13,25 +17,6 @@ set type double
 set maxvar 30000
 
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
-
-* Directory which contains do files
-global dir_do "${dir_work}/do"
-
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
-
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
-
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
-
-
 *******************************************************************
 cap log close 
 log using "${dir_log}/reg_leaveParentalHome.log", replace
@@ -39,74 +24,237 @@ log using "${dir_log}/reg_leaveParentalHome.log", replace
 
 use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-/*DP: note that the categories in les_c4 used by Cara are different from the ones currently used 
-so the categories in the corresponsing Excel file were updated */
-
-*Labeling and formating variables
-
-label define jbg 1 "Employed" 2 "Student" 3 "Not employed" 4 "Retired"
-
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children"
-			
-label define gdr 1  "Male" 0 "Female"
-				
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-			
-label define yn	1 "Yes" 0 "No"
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable dhhtp_c4 "Household Type: 4 Category"
-label variable stm "Year"
-label variable les_c4 "Employment Status: 4 Category" 
-label variable dhe "Self-rated Health"
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable ydses_c5 "Annual Household Income Quintile" 
-label variable dlltsd "Long-term Sick or Disabled"
-
-label value dgn gdr
-label value drgn1 rgna
-label value dhhtp_c4 hht 
-label value les_c4 jbg 
-label value deh_c3 edd 
-label value ded yn
-
+do "$dir_do/variable_update"
 
+* sample selection 
 drop if dag < 16
-replace stm = stm - 2000
+
+ 
+xtset idperson swv
 
 
-/*check if all covariates are available in the data*/ 
-recode dlftphm dgn dag dagsq deh_c3 les_c4 les_c3 ydses_c5 drgn1 stm (-9=.) 
+* Set Excel file 
 
-xtset idperson swv
+* Info sheet
+* Created in dir_results: the P1a and Gof sections below modify this same workbook
+putexcel set "$dir_results/reg_leave_parental_home", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters governing leaving parental home"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "P1a"
+putexcel B5 = "Probit regression estimates for leaving the parental home - 18+, not in intitial education spell, living with parents in t-1"
+
+putexcel A10 = "Notes:", bold
+putexcel B10 = "Added: ethnicity-4 cat (dot); covid dummies (y2020 y2021); not partnered condition (dcpst != 1) to be consistent with the simulation"
+* Gof sheet title goes into the same results workbook as the Info sheet
+putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold		
 
 
 ************************************
-*Process LPH1: Leave Parental Home *
+* Process P1a: Leave Parental Home *
 ************************************
-*Process P1a: Probability of leaving the parental home. Sample: All non-student respondents living with a parent.
-*Or Probability of leaving the parental home for those who have left education. (Students stay in the parental home).
 
-*sample: All non-student respondents aged 18+ who lived with a parent at t-1
-fre dlftphm if (ded==0 & dag>=18 & l.dlftphm==0) 
+* Process P1a: Probability of leaving the parental home. 
+* Sample: All respondents living with a parent in t-1, aged 18+, not in initial 
+* 			education spell 
+* DV: Left parental home dummy of those who lived with parents in t-1
+* Note: Added not partnered condition as well to be consistent with the simulation	
+fre dlftphm if (ded == 0 & dag >= 18 & dcpst != 1) //3.65%
+ 
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot ///
+    if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_P1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(P1a, dimlwt) side dec(4) 
+
+probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot ///
+    if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_P1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(P1a, disclwt) side dec(4)
+
+probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot ///
+    if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_P1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(P1a, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_P1a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
 
-probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm if (ded==0 & dag>=18 & l.dlftphm==0) [pweight=disclwt], vce(robust)
+probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot ///
+    if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimxwt], vce(robust)	
+	
+	
+	* save raw results 	
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/leave_parent_home", sheet("Process P1a male grads") replace
+putexcel set "$dir_raw_results/leave_parental_home/leave_parental_home", sheet("Process P1a") replace
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/P1a.doc", replace ///
-title("Process P1a: Probability of leaving the parental home. Sample: All non-student respondents living with a parent.") ///
- ctitle(Leave parental home) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+outreg2 stats(coef se pval) using "$dir_raw_results/leave_parental_home/P1a.doc", replace ///
+title("Process P1a: Probability of leaving the parental home. Sample: All respondents living with a parent and not in initial education spell.") ///
+ ctitle(Leave parental home) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))		
+gen in_sample = e(sample)
+
+
+predict p 
+
+save "$dir_validation_data/P1a_sample", replace
+	
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+	
+* Results
+* Note: Zero-valued entries (e.g. omitted base categories) are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+*  Store variance-covariance matrix 
+
+preserve
+* Round-trip V through a scratch workbook so that it can be edited as a dataset
+putexcel set "$dir_raw_results/leave_parental_home/var_cov", sheet("var_cov") ///
+	replace
+putexcel A1 = matrix(V)
+* Load the matrix back as data; this replaces the data in memory (undone by restore)
+import excel "$dir_raw_results/leave_parental_home/var_cov", sheet("var_cov") ///
+	clear
+* r(k) from describe is the number of imported columns; no_vars is reused below to loop over b
+describe
+local no_vars = `r(k)'	
+* Pass 1 drops rows that sum to zero and transposes; pass 2 does the same to the former columns
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+* Collect the remaining v* variables into matrix var and write it from C2 of the P1a sheet
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_leave_parental_home", sheet("P1a") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// First pass: count the non-zero entries of b so the target vector can be sized
+local non_zero_count = 0
+//local names : colnames b
+
+// no_vars was set in the var-cov step above (column count of the imported matrix)
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Row vector of missing values, one column per non-zero coefficient
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Second pass: copy the non-zero coefficients across, preserving their order in b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+// Write the transposed vector, with matrix names, from A1 of the P1a sheet
+putexcel set "$dir_results/reg_leave_parental_home", sheet("P1a") modify
+putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) 
+	
+	
+* Labeling 
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Deh_c3_Medium"
+putexcel A6 = "Deh_c3_Low"
+putexcel A7 = "Les_c3_Student_L1"
+putexcel A8 = "Les_c3_NotEmployed_L1"
+putexcel A9 = "Ydses_c5_Q2_L1"
+putexcel A10 = "Ydses_c5_Q3_L1"
+putexcel A11 = "Ydses_c5_Q4_L1"
+putexcel A12 = "Ydses_c5_Q5_L1"
+putexcel A13 = "UKC"
+putexcel A14 = "UKD"
+putexcel A15 = "UKE"
+putexcel A16 = "UKF"
+putexcel A17 = "UKG"
+putexcel A18 = "UKH"
+putexcel A19 = "UKJ"
+putexcel A20 = "UKK"
+putexcel A21 = "UKL"
+putexcel A22 = "UKM"
+putexcel A23 = "UKN"
+putexcel A24 = "Year_transformed"
+putexcel A25 = "Y2020"
+putexcel A26 = "Y2021"
+putexcel A27 = "Ethn_Asian"
+putexcel A28 = "Ethn_Black"
+putexcel A29 = "Ethn_Other"
+putexcel A30 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Deh_c3_Medium"
+putexcel G1 = "Deh_c3_Low"
+putexcel H1 = "Les_c3_Student_L1"
+putexcel I1 = "Les_c3_NotEmployed_L1"
+putexcel J1 = "Ydses_c5_Q2_L1"
+putexcel K1 = "Ydses_c5_Q3_L1"
+putexcel L1 = "Ydses_c5_Q4_L1"
+putexcel M1 = "Ydses_c5_Q5_L1"
+putexcel N1 = "UKC"
+putexcel O1 = "UKD"
+putexcel P1 = "UKE"
+putexcel Q1 = "UKF"
+putexcel R1 = "UKG"
+putexcel S1 = "UKH"
+putexcel T1 = "UKJ"
+putexcel U1 = "UKK"
+putexcel V1 = "UKL"
+putexcel W1 = "UKM"
+putexcel X1 = "UKN"
+putexcel Y1 = "Year_transformed"
+putexcel Z1 = "Y2020"
+putexcel AA1 = "Y2021"
+putexcel AB1 = "Ethn_Asian"
+putexcel AC1 = "Ethn_Black"
+putexcel AD1 = "Ethn_Other"
+putexcel AE1 = "Constant"
+
+	
+* Goodness of fit 
+
+putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify
+
+putexcel A3 = "P1a - Leaving parental home", bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
 
+drop in_sample p
+scalar drop r2_p N chi2 ll	
  
 capture log close 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do
index 55b7dbece..e67f621ba 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do
@@ -3,7 +3,13 @@
 * SECTION:			Unions
 * OBJECT: 			Final Probit Models
 * AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* LAST UPDATE:		1 July 2025 DP  
+* COUNTRY: 			UK  
+* 
+*NOTES: 			
+*                    
+* 					Reduced number of covariates in union formation process 
+*                   for those in initial education spell to obtain estimates. 	
 ********************************************************************************
 clear all
 set more off
@@ -13,25 +19,6 @@ set type double
 set maxvar 30000
 
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
-
-* Directory which contains do files
-global dir_do "${dir_work}/do"
-
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
-
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
-
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
-
-
 *******************************************************************
 cap log close 
 log using "${dir_log}/reg_partnership.log", replace
@@ -39,117 +26,649 @@ log using "${dir_log}/reg_partnership.log", replace
 
 use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-cap gen ypnbihs_dv_sq = ypnbihs_dv^2
-
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
-
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-
-label define gdr 1  "Male" 0 "Female"
-			
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-			
-label define yn	1 "Yes" 0 "No"
-
-label define dces 1 "Both Employed" 2 "Employed, Spouse Not Employed" 3 "Not Employed, Spouse Employed" 4 "Both Not Employed"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children"
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 3 Category" 
-label variable dhe "Self-rated Health"
-label variable dcpen "Entered a new Partnership"
-label variable dcpex "Partnership dissolution"
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable dnc "Number of Children in Household"
-label variable dnc02 "Number of Children aged 0-2 in Household"
-label variable ydses_c5 "Gross Annual Household Income Quintile" 
-label variable lesdf_c4 "Differential Employment Status"
-label variable ypnbihs_dv "Personal Non-benefit Gross Income"
-label variable ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared"
-label variable ynbcpdf_dv "Differential Personal Non-Benefit Gross Income"
-label variable dhhtp_c4 "Household Type: 4 Category"
-
-label value dgn gdr
-label value drgn1 rgna
-label value les_c3 lessp_c3 jbf 
-label value deh_c3 dehsp_c3 edd 
-label value dcpen dcpex yn
-label value lesdf_c4 dces
-label value dhhtp_c4 hht
+do "$dir_do/variable_update"
+
 
+
+*sample selection 
 drop if dag < 16
-replace stm = stm - 2000
 
-/*check if all covariates are available in the data*/ 
-recode dcpen dgn dag dagsq ydses_c5 dnc dnc02 dhe deh_c3 dehsp_c3 les_c3 ///
-ypnbihs_dv ypnbihs_dv_sq dnc dnc02 dhe dhesp ynbcpdf_dv dcpyy dcpagdf dhhtp_c4 lesdf_c4 ///
-drgn1 stm  (-9=. ) 
 
 xtset idperson swv
 
+* Set Excel file 
+
+* Info sheet
+
+putexcel set "$dir_results/reg_partnership", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters for relationship status projection"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "U1a"
+putexcel B5 = "Probit regression estimates  probability of entering  a partnership - single respondents aged 18+ in initial education spell"
+putexcel A6 = "U1b"
+putexcel B6 = "Probit regression estimates of probability of entering a partnership - single respondents aged 18+ not in initial education spell"
+putexcel A7 = "U2b"
+putexcel B7 = "Probit regression estimates of probability of exiting a partnership - cohabiting women aged 18+ not in initial education spell"
+
+putexcel A10 = "Notes:", bold
+putexcel B10 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot) and Covid dummies (y2020 y2021)"
+putexcel B11 = "U1a: Just 73 obs with positive outcome! Cannot include region and covid dummies as covariates. Cannot obtain estimates of the 5th quintile of hh income"
+putexcel B12 = "U2b contains a new variable New_rel_L1"
+
+putexcel set "$dir_results/reg_partnership", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold		
 
-***************************************************************
-*Process U1a: Entering a partnership - In continuous education *
-***************************************************************
-*Probability of entering a partnership. 
-*Sample: All single respondents aged 18 and older, in continuous education.
-fre dcpen if (dag>=18 & ded==1 & ssscp!=1) //exclude same sex couples
+****************************************************
+* U1a: Partnership formation, in initial edu spell *
+****************************************************
+* Probability of entering a partnership. 
+* Sample: All single respondents aged 18 +, in continuous education.
+* DV: Enter partnership dummy 
+* Note: Requirement of being single in the previous year is embedded in the 
+* 			dependent variable  
+* 		Only 73 observations of relationships forming when still in initial 
+* 			education spell and aged 18+.
+ 
+fre dcpen if (dag >= 18 & ded == 1 & ssscp != 1) 
+
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag>=18 & ded==1 & ssscp!=1) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(U1a, dimlwt) side dec(4) 
+
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag>=18 & ded==1 & ssscp!=1) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1a, disclwt) side dec(4)
+
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag>=18 & ded==1 & ssscp!=1) [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1a, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_U1a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
 
-probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 i.dhe ib8.drgn1 stm if (dag>=16 & ded==1 & ssscp!=1) [pweight=disclwt], vce(robust)
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag>=18 & ded==1 & ssscp!=1) [pweight=dimxwt], vce(robust)
+ 
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/union", sheet("Process U1a") replace
+putexcel set "$dir_raw_results/partnership/partnership", sheet("U1a") replace
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/U1a.doc", replace ///
-title("Process U1a: Probit regression estimates for entering a partnership - single respondents aged 16+ in continuous education") ///
+outreg2 stats(coef se pval) using "$dir_raw_results/partnership/U1a.doc", replace ///
+title("Process U1a: Probit regression estimates for entering a partnership - single respondents aged 18+ in continuous education") ///
  ctitle(enter partnership) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+ 
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/U1a_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+
+		
+* Results		
+* Note: Zero-valued entries (e.g. omitted base categories) are eliminated 
+
+matrix b = e(b)	
+matrix V = e(V)
+
+
+* Store variance-covariance matrix 
 
+preserve
+
+putexcel set "$dir_raw_results/partnership/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/partnership/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_partnership", sheet("U1a") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_partnership", sheet("U1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+
+* Labelling
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Ydses_c5_Q2_L1"
+putexcel A6 = "Ydses_c5_Q3_L1"
+putexcel A7 = "Ydses_c5_Q4_L1"
+putexcel A8 = "Dnc_L1"
+putexcel A9 = "Dnc02_L1"
+putexcel A10 = "Dhe_pcs"
+putexcel A11 = "Dhe_mcs"
+putexcel A12 = "Year_transformed"
+putexcel A13 = "Ethn_Asian"
+putexcel A14 = "Ethn_Black"
+putexcel A15 = "Ethn_Other"
+putexcel A16 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Ydses_c5_Q2_L1"
+putexcel G1 = "Ydses_c5_Q3_L1"
+putexcel H1 = "Ydses_c5_Q4_L1"
+putexcel I1 = "Dnc_L1"
+putexcel J1 = "Dnc02_L1"
+putexcel K1 = "Dhe_pcs"
+putexcel L1 = "Dhe_mcs"
+putexcel M1 = "Year_transformed"
+putexcel N1 = "Ethn_Asian"
+putexcel O1 = "Ethn_Black"
+putexcel P1 = "Ethn_Other"
+putexcel Q1 = "Constant"
  
-********************************************************************
-*Process U1b: Entering a partnership - Not in continuous education  *
-********************************************************************
-*Probability of entering a partnership. 
-*Sample: All respondents aged 18+ who were not in a parthership at t-1 and were not in continuous education
-fre dcpen if (dag>=18 & ded==0 & ssscp!=1) //exclude same sex couples
-
-probit dcpen i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 l.dnc l.dnc02 i.dhe ib8.drgn1 stm if (dag>=18 & ded==0 & ssscp!=1) [pweight=disclwt], vce(robust)
+* Goodness of fit
+
+putexcel set "$dir_results/reg_partnership", sheet("Gof") modify
+
+putexcel A3 = "U1a - Partnership formation, in initial education spell", ///
+	bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
+
+********************************************************
+* U1b: Partnership formation, not in initial edu spell *
+********************************************************
+* Process U1b: Probability of entering a partnership. 
+* Sample: All respondents aged 18+, left initial education spell and not in a 
+* 			same sex relationship 
+* DV: Enter partnership dummy (requires not having been in a relationship last 
+* 		year)	
+* Note: Requirement of being single in the previous year is embedded in the 
+* 			dependent variable  
+* 		Income captured by hh quintiles. 
+
+fre dcpen if (dag >= 18 & ded == 0 & ssscp != 1)
+
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(U1b, dimlwt) side dec(4) 
+
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1b, disclwt) side dec(4)
+
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot ///
+if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1b, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_U1b.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+
+probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs ib8.drgn1 stm y2020 y2021 i.dot ///
+if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=dimxwt], vce(robust)
+
+* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/union", sheet("Process U1b") modify
+putexcel set "$dir_raw_results/partnership/partnership", sheet("Process U1b") modify // modify, not replace: keep the U1a sheet already saved in this workbook
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/U1b.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/partnership/U1b.doc", replace ///
 title("Process U1b: Probit regression estimates for entering a partnership - single respondents aged 18+ not in continuous education") ///
  ctitle(enter partnership) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+ 
+gen in_sample = e(sample)	
 
+predict p
 
-******************************************************************
-*Process 2b: Exiting a partnership - Not in continuous education *
-******************************************************************
-*Probability of partnership break-up.
-*Sample: Female member of a couple aged 18+ who were in a partnership at t-1 and not in a partnership at t and were not in continuous education
-fre dcpex if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) //exclude same sex couples
+save "$dir_validation_data/U1b_sample", replace
 
-probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 li.dhe li.dhesp l.dcpyy l.dcpagdf l.dnc l.dnc02 lib1.dhhtp_c4 lib1.lesdf_c4 ///
-l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dhhwt], vce(robust)
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+	
+	
+* Results 	
+* Note: Zero-valued entries (e.g. omitted base categories) are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+*  Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/partnership/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/partnership/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_partnership", sheet("U1b") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_partnership", sheet("U1b") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+		
+* Labelling
+* Layout: column A = regressor names, column B = coefficients, C onwards = var-cov matrix
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Ydses_c5_Q2_L1"
+putexcel A6 = "Ydses_c5_Q3_L1"
+putexcel A7 = "Ydses_c5_Q4_L1"
+putexcel A8 = "Ydses_c5_Q5_L1"
+putexcel A9 = "Dnc_L1"
+putexcel A10 = "Dnc02_L1"
+putexcel A11 = "Dhe_pcs"
+putexcel A12 = "Dhe_mcs"
+putexcel A13 = "UKC"
+putexcel A14 = "UKD"
+putexcel A15 = "UKE"
+putexcel A16 = "UKF"
+putexcel A17 = "UKG"
+putexcel A18 = "UKH"
+putexcel A19 = "UKJ"
+putexcel A20 = "UKK"
+putexcel A21 = "UKL"
+putexcel A22 = "UKM"
+putexcel A23 = "UKN"
+putexcel A24 = "Year_transformed"
+putexcel A25 = "Y2020"
+putexcel A26 = "Y2021"
+putexcel A27 = "Ethn_Asian"
+putexcel A28 = "Ethn_Black"
+putexcel A29 = "Ethn_Other"
+putexcel A30 = "Constant"
+* Var-cov headers start at C1 because the matrix is written from C2; B1 labels the coefficients
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Ydses_c5_Q2_L1"
+putexcel G1 = "Ydses_c5_Q3_L1"
+putexcel H1 = "Ydses_c5_Q4_L1"
+putexcel I1 = "Ydses_c5_Q5_L1"
+putexcel J1 = "Dnc_L1"
+putexcel K1 = "Dnc02_L1"
+putexcel L1 = "Dhe_pcs"
+putexcel M1 = "Dhe_mcs"
+putexcel N1 = "UKC"
+putexcel O1 = "UKD"
+putexcel P1 = "UKE"
+putexcel Q1 = "UKF"
+putexcel R1 = "UKG"
+putexcel S1 = "UKH"
+putexcel T1 = "UKJ"
+putexcel U1 = "UKK"
+putexcel V1 = "UKL"
+putexcel W1 = "UKM"
+putexcel X1 = "UKN"
+putexcel Y1 = "Year_transformed"
+putexcel Z1 = "Y2020"
+putexcel AA1 = "Y2021"
+putexcel AB1 = "Ethn_Asian"
+putexcel AC1 = "Ethn_Black"
+putexcel AD1 = "Ethn_Other"
+putexcel AE1 = "Constant"
+
+* Goodness of fit 
+
+putexcel set "$dir_results/reg_partnership", sheet("Gof") modify
+
+putexcel A9 = "U1b - Partnership formation, left initial education spell", ///
+	bold		
+
+putexcel A11 = "Pseudo R-squared" 
+putexcel B11 = r2_p 
+putexcel A12 = "N"
+putexcel B12 = N 
+putexcel E11 = "Chi^2"		
+putexcel F11 = chi2
+putexcel E12 = "Log likelihood"		
+putexcel F12 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
+
+**********************************************************
+* U2b: Partnership termination, not in initial edu spell *
+**********************************************************
+
+* Process U2b: Probability of partnership break-up.
+* Sample: 	Female member of a heterosexual couple in t-1 aged 18+ and not in 
+* 			continuous education
+* DV: Exit partnership dummy
+* Note:	Requirement to be in a relationship last year is embedded in the DV.
+* 		The ded condition refers to the female partner only. 
+* 		Dropping the ded condition makes no difference because no splits are
+* 		observed among those in their initial education spell. 
+		
+fre dcpex if (dgn == 0 & dag >= 18 & ded == 0 & ssscp != 1) 
+
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02  lib1.lesdf_c4 ///
+     l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot ///
+	 if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(U2b, dimlwt) side dec(4) 
+
+probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02  lib1.lesdf_c4 ///
+     l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot ///
+	 if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U2b, disclwt) side dec(4)
+
+probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02  lib1.lesdf_c4 ///
+     l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot ///
+	 if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dhhwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U2b, dhhwt) side dec(4) 
+probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02  lib1.lesdf_c4 ///
+     l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot ///
+	 if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U2b, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_U2b.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02  lib1.lesdf_c4 ///
+     l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot ///
+	 if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dimxwt], vce(robust)
+
+	* raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/union", sheet("Process U2b") modify
+putexcel set "$dir_raw_results/partnership/partnership", sheet("Process U2b") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/U2b.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/partnership/U2b.doc", replace ///
 title("Process U2b: Probit regression estimates for exiting a partnership - cohabiting women aged 18+ not in continuous education") ///
  ctitle(enter partnership) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+	
+	
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/U2b_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	 
+scalar chi2 = e(chi2)
+scalar ll = e(ll)
+
+
+* Results 	
+* Note: Zero-valued entries are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+matrix list  V
+
+*  Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/partnership/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/partnership/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_partnership", sheet("U2b") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_partnership", sheet("U2b") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+	
+* Labelling 
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dag"
+putexcel A3 = "Dag_sq"
+putexcel A4 = "Deh_c3_Medium_L1"
+putexcel A5 = "Deh_c3_Low_L1"
+putexcel A6 = "Dehsp_c3_Medium_L1"
+putexcel A7 = "Dehsp_c3_Low_L1"
+putexcel A8 = "Dhe_pcs"
+putexcel A9 = "Dhe_mcs"
+putexcel A10 = "Dhe_pcssp"
+putexcel A11 = "Dhe_mcssp"
+putexcel A12 = "Dcpyy_L1"
+putexcel A13 = "New_rel_L1"
+putexcel A14 = "Dcpagdf_L1"
+putexcel A15 = "Dnc_L1"
+putexcel A16 = "Dnc02_L1"
+putexcel A17 = "Lesdf_c4_EmployedSpouseNotEmployed_L1"
+putexcel A18 = "Lesdf_c4_NotEmployedSpouseEmployed_L1"
+putexcel A19 = "Lesdf_c4_BothNotEmployed_L1"
+putexcel A20 = "Ypnbihs_dv_L1"
+putexcel A21 = "Ynbcpdf_dv_L1"
+putexcel A22 = "UKC"
+putexcel A23 = "UKD"
+putexcel A24 = "UKE"
+putexcel A25 = "UKF"
+putexcel A26 = "UKG"
+putexcel A27 = "UKH"
+putexcel A28 = "UKJ"
+putexcel A29 = "UKK"
+putexcel A30 = "UKL"
+putexcel A31 = "UKM"
+putexcel A32 = "UKN"
+putexcel A33 = "Year_transformed"
+putexcel A34 = "Y2020"
+putexcel A35 = "Y2021"
+putexcel A36 = "Ethn_Asian"
+putexcel A37 = "Ethn_Black"
+putexcel A38 = "Ethn_Other"
+putexcel A39 = "Constant"
+
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dag"
+putexcel D1 = "Dag_sq"
+putexcel E1 = "Deh_c3_Medium_L1"
+putexcel F1 = "Deh_c3_Low_L1"
+putexcel G1 = "Dehsp_c3_Medium_L1"
+putexcel H1 = "Dehsp_c3_Low_L1"
+putexcel I1 = "Dhe_pcs"
+putexcel J1 = "Dhe_mcs"
+putexcel K1 = "Dhe_pcssp"
+putexcel L1 = "Dhe_mcssp"
+putexcel M1 = "Dcpyy_L1"
+putexcel N1 = "New_rel_L1"
+putexcel O1 = "Dcpagdf_L1"
+putexcel P1 = "Dnc_L1"
+putexcel Q1 = "Dnc02_L1"
+putexcel R1 = "Lesdf_c4_EmployedSpouseNotEmployed_L1"
+putexcel S1 = "Lesdf_c4_NotEmployedSpouseEmployed_L1"
+putexcel T1 = "Lesdf_c4_BothNotEmployed_L1"
+putexcel U1 = "Ypnbihs_dv_L1"
+putexcel V1 = "Ynbcpdf_dv_L1"
+putexcel W1 = "UKC"
+putexcel X1 = "UKD"
+putexcel Y1 = "UKE"
+putexcel Z1 = "UKF"
+putexcel AA1 = "UKG"
+putexcel AB1 = "UKH"
+putexcel AC1 = "UKJ"
+putexcel AD1 = "UKK"
+putexcel AE1 = "UKL"
+putexcel AF1 = "UKM"
+putexcel AG1 = "UKN"
+putexcel AH1 = "Year_transformed"
+putexcel AI1 = "Y2020"
+putexcel AJ1 = "Y2021"
+putexcel AK1 = "Ethn_Asian"
+putexcel AL1 = "Ethn_Black"
+putexcel AM1 = "Ethn_Other"
+putexcel AN1 = "Constant"
+
+* Goodness of fit
+
+putexcel set "$dir_results/reg_partnership", sheet("Gof") modify
+
+putexcel A15 = ///
+	"U2b - Partnership termination, left initial education spell", bold		
 
+putexcel A17 = "Pseudo R-squared" 
+putexcel B17 = r2_p 
+putexcel A18 = "N"
+putexcel B18 = N 
+putexcel E17 = "Chi^2"		
+putexcel F17 = chi2
+putexcel E18 = "Log likelihood"		
+putexcel F18 = ll		
 
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+	
+	
 capture log close 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do b/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do
index d0cdecf27..73b1df299 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do
@@ -3,7 +3,11 @@
 * SECTION:			Retirement  
 * OBJECT: 			Final Regresion Models 
 * AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* LAST UPDATE:		1 July 2025 DP
+* COUNTRY: 			UK  
+*
+* NOTES: 			
+* 
 ********************************************************************************
 clear all
 set more off
@@ -13,25 +17,6 @@ set type double
 set maxvar 30000
 
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
-
-* Directory which contains do files
-global dir_do "${dir_work}/do"
-
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
-
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
-
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
-
-
 *******************************************************************
 cap log close 
 log using "${dir_log}/reg_retirement.log", replace
@@ -39,79 +24,475 @@ log using "${dir_log}/reg_retirement.log", replace
 
 use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-*Labeling and formating variables
-label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
-
-label define edd 1 "Degree"	2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
-
-label define gdr 1  "Male" 0 "Female"
-			
-label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" ///
-6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" ///
-12 "Scotland" 13 "Northern Ireland"
-			
-label define yn	1 "Yes" 0 "No"
-
-label define hht 1 "Couples with No Children" 2 "Couples with Children" ///
-				3 "Single with No Children" 4 "Single with Children"
-
-label variable dgn "Gender"
-label variable dag "Age"
-label variable dagsq "Age Squared"
-label variable drgn1 "Region"
-label variable stm "Year"
-label variable les_c3 "Employment Status: 3 Category" 
-label variable dhe "Self-rated Health"
-label variable deh_c3 "Educational Attainment: 3 Category"
-label variable dhhtp_c4 "Household Type: 4 Category"
-
-label value dgn gdr
-label value drgn1 rgna
-label value les_c3 lessp_c3 jbf 
-label value deh_c3 dehsp_c3 edd 
-label value dcpen dcpex dlrtrd dagpns dagpns_sp yn
-label value dhhtp_c4 hht
+do "$dir_do/variable_update"
 
+	
+* sample selection 
 drop if dag < 16
-replace stm = stm - 2000
 
-*check if all covariates are available and recode missing values 
-recode dgn dag dagsq deh_c3 dagpns lesnr_c2 ydses_c5 dlltsd drgn1 stm dcpst drtren dagpns_sp lessp_c3 dlltsd_sp dcpst (-9=.)
 
 xtset idperson swv
 
 
-*******************************************
-*Process R1a: Enter Retirement - Single   *
-*******************************************
-*Sample: Non-partnered individuals aged 50+ who are not yet retired.
-probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 li.ydses_c5 li.dlltsd ib8.drgn1 stm ///
-if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimlwt], vce(robust)
+* Set Excel file 
+
+* Info sheet
+
+putexcel set "$dir_results/reg_retirement", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "Model parameters governing projection of retirement"
+putexcel A2 = "Authors:	Patryk Bronka, Justin van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+
+putexcel A5 = "R1a"
+putexcel B5 = "Probit regression estimates of the probability of retiring, single individuals aged 50+ not yet retired"
+
+putexcel A6 = "R1b"
+putexcel B6 = "Probit regression estimates of the probability of retiring, cohabiting individuals aged 50+ not yet retired"
+
+putexcel A10 = "Notes:", bold
+putexcel B10 = "replaced dlltsd with dlltsd01; added dhe_pcs and dhe_mcs, ethnicity-4 cat(dot) and Covid dummies (y2020 y2021)"
+
+putexcel set "$dir_results/reg_retirement", sheet("Gof") modify
+putexcel A1 = "Goodness of fit", bold		
+
+
+****************************
+* R1a: Retirement - Single *
+****************************
+
+* Process R1a: Probability retire if single 
+* Sample: Non-partnered individuals aged 50+ who are not yet retired.
+* DV: Enter retirement dummy (have to not be retired last year)
+
+fre drtren if ((dcpst==2 | dcpst==3) & dag>=50)
+
+/*/////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+    li.ydses_c5 li.dlltsd ib8.drgn1 stm y2020 y2021 i.dot ///
+ if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_R1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(R1a, dimlwt) side dec(4) 
+
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+    li.ydses_c5 li.dlltsd ib8.drgn1 stm y2020 y2021 i.dot ///
+ if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_R1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1a, disclwt) side dec(4)
+
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+    li.ydses_c5 li.dlltsd ib8.drgn1 stm y2020 y2021 i.dot ///
+ if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_R1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1a, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_R1a.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+    li.ydses_c5 li.dlltsd01 l.dhe_pcs l.dhe_mcs ///
+	ib8.drgn1 stm y2020 y2021 i.dot ///
+ if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimxwt], vce(robust)
+
+   * raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/retire", sheet("Process R1a") replace
+putexcel set "$dir_raw_results/retirement/retirement", sheet("Process R1a") replace
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/R1a.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/retirement/R1a.doc", replace ///
 title("Process R1a: Probit regression estimates for retiring - single individuals aged 50+ not yet retired") ///
  ctitle(retiring) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+gen in_sample = e(sample)	
 
+predict p
+
+save "$dir_validation_data/R1a_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+	
+	
+* Results
+* Note: Zero-valued entries are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+* Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/retirement/var_cov", sheet("var_cov") ///
+	replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/retirement/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_retirement", sheet("R1a") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_retirement", sheet("R1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)  
  
-*********************************************
-*Process R1b: Enter Retirement - Partnered  *
-*********************************************
-*Sample: Partnered individuals aged 50+ who are not yet retired.
-probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp li.lessp_c3 li.dlltsd_sp ///
-ib8.drgn1 stm if (ssscp!=1 & dcpst==1 & dag>=50) [pweight=dimlwt], vce(robust)
+ 
+* Labelling 
+ 
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Deh_c3_Medium"
+putexcel A6 = "Deh_c3_Low"
+putexcel A7 = "Reached_Retirement_Age"
+putexcel A8 = "Lesnr_c2_NotEmployed_L1"
+putexcel A9 = "Ydses_c5_Q2_L1"
+putexcel A10 = "Ydses_c5_Q3_L1"
+putexcel A11 = "Ydses_c5_Q4_L1"
+putexcel A12 = "Ydses_c5_Q5_L1"
+putexcel A13 = "Dlltsd01_L1"
+putexcel A14 = "Dhe_pcs_L1"
+putexcel A15 = "Dhe_mcs_L1"
+putexcel A16 = "UKC"
+putexcel A17 = "UKD"
+putexcel A18 = "UKE"
+putexcel A19 = "UKF"
+putexcel A20 = "UKG"
+putexcel A21 = "UKH"
+putexcel A22 = "UKJ"
+putexcel A23 = "UKK"
+putexcel A24 = "UKL"
+putexcel A25 = "UKM"
+putexcel A26 = "UKN"
+putexcel A27 = "Year_transformed"
+putexcel A28 = "Y2020"
+putexcel A29 = "Y2021"
+putexcel A30 = "Ethn_Asian"
+putexcel A31 = "Ethn_Black"
+putexcel A32 = "Ethn_Other"
+putexcel A33 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Deh_c3_Medium"
+putexcel G1 = "Deh_c3_Low"
+putexcel H1 = "Reached_Retirement_Age"
+putexcel I1 = "Lesnr_c2_NotEmployed_L1"
+putexcel J1 = "Ydses_c5_Q2_L1"
+putexcel K1 = "Ydses_c5_Q3_L1"
+putexcel L1 = "Ydses_c5_Q4_L1"
+putexcel M1 = "Ydses_c5_Q5_L1"
+putexcel N1 = "Dlltsd01_L1"
+putexcel O1 = "Dhe_pcs_L1"
+putexcel P1 = "Dhe_mcs_L1"
+putexcel Q1 = "UKC"
+putexcel R1 = "UKD"
+putexcel S1 = "UKE"
+putexcel T1 = "UKF"
+putexcel U1 = "UKG"
+putexcel V1 = "UKH"
+putexcel W1 = "UKJ"
+putexcel X1 = "UKK"
+putexcel Y1 = "UKL"
+putexcel Z1 = "UKM"
+putexcel AA1 = "UKN"
+putexcel AB1 = "Year_transformed"
+putexcel AC1 = "Y2020"
+putexcel AD1 = "Y2021"
+putexcel AE1 = "Ethn_Asian"
+putexcel AF1 = "Ethn_Black"
+putexcel AG1 = "Ethn_Other"
+putexcel AH1 = "Constant"
+
+
+* Goodness of fit
+
+putexcel set "$dir_results/reg_retirement", sheet("Gof") modify
+
+putexcel A3 = "R1a - Retirement single", bold		
+
+putexcel A5 = "Pseudo R-squared" 
+putexcel B5 = r2_p 
+putexcel A6 = "N"
+putexcel B6 = N 
+putexcel E5 = "Chi^2"		
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"		
+putexcel F6 = ll		
+
+drop in_sample p
+scalar drop r2_p N chi2 ll	
+
+
+
+
+******************************
+* R1b: Retirement, partnered *
+******************************
+
+* Process R1b: Probability retire if partnered 
+* Sample: Partnered heterosexual individuals aged 50+ who are not yet retired
+* DV: Enter retirement dummy (have to not be retired last year)
+count if (ssscp!=1 & dcpst==1 & dag>=50) & lessp_c3==2 //115 obs partnered with students 
+drop if (ssscp!=1 & dcpst==1 & dag>=50) & lessp_c3==2 //drop partnered with students 
+
+fre drtren if (ssscp!=1 & dcpst==1 & dag>=50)
+
+/*//////////////////////////////////////////////////////////////////////////////////////////////////	 
+//check weights //////////////////////////////////////////////////////////////////////////////////	 
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+     i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp ///
+     li.lessp_c3 li.dlltsd_sp ib8.drgn1 stm  y2020 y2021 i.dot if ///
+	 (ssscp!=1 & dcpst==1 & dag>=50)  [pweight=dimlwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_R1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(R1b, dimlwt) side dec(4) 
+
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+     i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp ///
+     li.lessp_c3 li.dlltsd_sp ib8.drgn1 stm  y2020 y2021 i.dot if ///
+	 (ssscp!=1 & dcpst==1 & dag>=50)  [pweight=disclwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_R1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1b, disclwt) side dec(4)
+
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+     i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp ///
+     li.lessp_c3 li.dlltsd_sp ib8.drgn1 stm  y2020 y2021 i.dot if ///
+	 (ssscp!=1 & dcpst==1 & dag>=50)  [pweight=dimxwt], vce(robust)
+outreg2 using "${weight_checks}/weight_comparison_R1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1b, dimxwt) side dec(4) 
+erase "${weight_checks}/weight_comparison_R1b.txt"
+//////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+*/
+
+probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 ///
+     i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd01 l.dhe_pcs l.dhe_mcs i.dagpns_sp ///
+     li.lessp_c3 li.dlltsd01_sp ib8.drgn1 stm y2020 y2021 i.dot if ///
+	 (ssscp!=1 & dcpst==1 & dag>=50) [pweight=dimxwt], vce(robust)
+
+   * raw results 
 matrix results = r(table)
 matrix results = results[1..6,1...]'
-putexcel set "$dir_data/retire", sheet("Process R1b") modify
+putexcel set "$dir_raw_results/retirement/retirement", sheet("Process R1b") modify
 putexcel A3 = matrix(results), names nformat(number_d2) 
 putexcel J4 = matrix(e(V))
-outreg2 stats(coef se pval) using "$dir_data/R1b.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/retirement/R1b.doc", replace ///
 title("Process R1b: Probit regression estimates for retiring - cohabiting individuals aged 50+ not yet retired") ///
  ctitle(retiring) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll))
+	
+
+gen in_sample = e(sample)	
+
+predict p
+
+save "$dir_validation_data/R1b_sample", replace
+
+scalar r2_p = e(r2_p) 
+scalar N = e(N)	
+scalar chi2 = e(chi2)
+scalar ll = e(ll)	
+	
+
+* Results 
+* Note: Zero-valued entries are eliminated 
+	
+matrix b = e(b)	
+matrix V = e(V)
+
+
+* Store variance-covariance matrix 
+
+preserve
+
+putexcel set "$dir_raw_results/retirement/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/retirement/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+putexcel set "$dir_results/reg_retirement", sheet("R1b") modify
+putexcel C2 = matrix(var)
+		
+restore	
+
+
+* Store estimated coefficients 
+
+// Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+// Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+// Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+// Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_retirement", sheet("R1b") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) 
+	
+
+* Labelling
+
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Deh_c3_Medium"
+putexcel A6 = "Deh_c3_Low"
+putexcel A7 = "Reached_Retirement_Age"
+putexcel A8 = "Lesnr_c2_NotEmployed_L1"
+putexcel A9 = "Reached_Retirement_Age_Lesnr_c2_NotEmployed_L1"
+putexcel A10 = "Ydses_c5_Q2_L1"
+putexcel A11 = "Ydses_c5_Q3_L1"
+putexcel A12 = "Ydses_c5_Q4_L1"
+putexcel A13 = "Ydses_c5_Q5_L1"
+putexcel A14 = "Dlltsd01_L1"
+putexcel A15 = "Dhe_pcs_L1"
+putexcel A16 = "Dhe_mcs_L1"
+putexcel A17 = "Reached_Retirement_Age_Sp"
+putexcel A18 = "Lessp_c3_NotEmployed_L1"
+putexcel A19 = "Dlltsd01_sp_L1"
+putexcel A20 = "UKC"
+putexcel A21 = "UKD"
+putexcel A22 = "UKE"
+putexcel A23 = "UKF"
+putexcel A24 = "UKG"
+putexcel A25 = "UKH"
+putexcel A26 = "UKJ"
+putexcel A27 = "UKK"
+putexcel A28 = "UKL"
+putexcel A29 = "UKM"
+putexcel A30 = "UKN"
+putexcel A31 = "Year_transformed"
+putexcel A32 = "Y2020"
+putexcel A33 = "Y2021"
+putexcel A34 = "Ethn_Asian"
+putexcel A35 = "Ethn_Black"
+putexcel A36 = "Ethn_Other"
+putexcel A37 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Deh_c3_Medium"
+putexcel G1 = "Deh_c3_Low"
+putexcel H1 = "Reached_Retirement_Age"
+putexcel I1 = "Lesnr_c2_NotEmployed_L1"
+putexcel J1 = "Reached_Retirement_Age_Lesnr_c2_NotEmployed_L1" // must equal the regressor label written to A9 (i.dagpns#li.lesnr_c2)
+putexcel K1 = "Ydses_c5_Q2_L1"
+putexcel L1 = "Ydses_c5_Q3_L1"
+putexcel M1 = "Ydses_c5_Q4_L1"
+putexcel N1 = "Ydses_c5_Q5_L1"
+putexcel O1 = "Dlltsd01_L1"
+putexcel P1 = "Dhe_pcs_L1"
+putexcel Q1 = "Dhe_mcs_L1"
+putexcel R1 = "Reached_Retirement_Age_Sp"
+putexcel S1 = "Lessp_c3_NotEmployed_L1"
+putexcel T1 = "Dlltsd01_sp_L1"
+putexcel U1 = "UKC"
+putexcel V1 = "UKD"
+putexcel W1 = "UKE"
+putexcel X1 = "UKF"
+putexcel Y1 = "UKG"
+putexcel Z1 = "UKH"
+putexcel AA1 = "UKJ"
+putexcel AB1 = "UKK"
+putexcel AC1 = "UKL"
+putexcel AD1 = "UKM"
+putexcel AE1 = "UKN"
+putexcel AF1 = "Year_transformed"
+putexcel AG1 = "Y2020"
+putexcel AH1 = "Y2021"
+putexcel AI1 = "Ethn_Asian"
+putexcel AJ1 = "Ethn_Black"
+putexcel AK1 = "Ethn_Other"
+putexcel AL1 = "Constant"
+
+
+* Goodness of fit
+
+putexcel set "$dir_results/reg_retirement", sheet("Gof") modify
+
+putexcel A9 = "R1b - Retirement partnered", bold		
+
+putexcel A11 = "Pseudo R-squared" 
+putexcel B11 = r2_p 
+putexcel A12 = "N"
+putexcel B12 = N 
+putexcel E11 = "Chi^2"		
+putexcel F11 = chi2
+putexcel E12 = "Log likelihood"		
+putexcel F12 = ll		
 
+drop in_sample p
+scalar drop r2_p N chi2 ll	
 
 capture log close 
 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do b/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do
index 64f1787d5..afdfca159 100644
--- a/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do
+++ b/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do
@@ -2,8 +2,8 @@
 * PROJECT:  		ESPON 
 * SECTION:			Wage regression 
 * OBJECT: 			Heckman regressions 
-* AUTHORS:			Daria Popova, Justin van de Ven
-* LAST UPDATE:		21/04/2024 (JV)
+* AUTHORS:			Patryk Bronka, Daria Popova, Justin van de Ven
+* LAST UPDATE:		3 July 2025 DP 
 ********************************************************************************
 clear all
 set more off
@@ -13,25 +13,6 @@ set type double
 set maxvar 30000
 
 
-/*******************************************************************************
-*	DEFINE DIRECTORIES
-*******************************************************************************/
-* Working directory
-global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates"
-
-* Directory which contains do files
-global dir_do "${dir_work}/do"
-
-* Directory which contains data files 
-global dir_data "${dir_work}/data"
-
-* Directory which contains log files 
-global dir_log "${dir_work}/log"
-
-* Directory which contains pooled UKHLS dataset 
-global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data"
-
-
 *******************************************************************
 cap log close 
 log using "${dir_log}/reg_wages.log", replace
@@ -75,45 +56,45 @@ program computePredicted
 	
 end
 
-capture program drop analyseFit
+capture program drop analyseFit 
 program analyseFit
 
-	local filter = "`1'"
+    // 1 = filter: if-condition selecting the observations to analyse
+    // 2 = optional flag "nocorr": when passed, the lagged-wage correlations are skipped
+    // 3 = title shown on both histograms
+    // 4 = suffix for filename of the exported graphs (log_<suffix>.png, level_<suffix>.png)
+
+    local filter = "`1'"
+
+    quietly sum lwage_hour lwage_hour_hat wage_hour wage_hour_hat if `filter'
+
+    if "`2'" != "nocorr" {
+        corr wage_hour L1.wage_hour if `filter' & previouslyWorking
+        corr wage_hour_hat L1.wage_hour_hat if `filter' & previouslyWorking
+    }
+
+    // Log wage graph: observed vs predicted histograms overlaid, then exported as PNG
+    twoway (hist lwage_hour if `filter', lcolor(gs12) fcolor(gs12) ) ///
+        (hist lwage_hour_hat if `filter', fcolor(none) lcolor(red) ), ///
+        xtitle("log gross hourly wages (GBP)") ///
+        legend(label(1 "observed") label(2 "predicted")) ///
+        name(log, replace) ///
+        title("`3'")
+    
+    graph export "${dir_validation_graphs}/wages/log_`4'.png", replace
+
+    // Level wage graph: same overlay in levels (percent scale, bins of width 1 from 0), wages below 150 only
+    twoway (hist wage_hour if `filter' & wage_hour < 150, percent lcolor(gs12) fcolor(gs12) start(0) width(1)) ///
+        (hist wage_hour_hat if `filter' & wage_hour_hat < 150, percent fcolor(none) lcolor(red) start(0) width(1)), ///
+        xtitle("gross hourly wages (GBP)") ///
+        legend(label(1 "observed") label(2 "predicted")) ///
+        name(levels, replace) ///
+        title("`3'")
+    
+    graph export "${dir_validation_graphs}/wages/level_`4'.png", replace
 
-	sum lwage_hour lwage_hour_hat wage_hour wage_hour_hat if `filter'
-	if ("`2'" != "nocorr") {
-		corr wage_hour L1.wage_hour if `filter' & previouslyWorking
-		corr wage_hour_hat L1.wage_hour_hat if `filter' & previouslyWorking
-	}
-		
-	twoway (hist lwage_hour if `filter', lcolor(gs12) fcolor(gs12)) ///
-		(hist lwage_hour_hat if `filter', fcolor(none) lcolor(red)), xtitle (log gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(log, replace)
-	
-	twoway (hist wage_hour if `filter' & wage_hour < 150, lcolor(gs12) fcolor(gs12)) ///
-		(hist wage_hour_hat if `filter' & wage_hour_hat < 150, fcolor(none) lcolor(red)), xtitle (gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(levels, replace)
-		
 end
 
-capture program drop analyseFit2
-program analyseFit2
-
-	local filter = "`1'"
-
-	sum lwage_hour lwage_hour_hat wage_hour wage_hour_hat if `filter'
-	if ("`2'" != "nocorr") {
-		corr wage_hour L1.wage_hour if `filter' & previouslyWorking
-		corr wage_hour_hat L1.wage_hour_hat if `filter' & previouslyWorking
-	}
-		
-	twoway (hist lwage_hour if `filter', lcolor(gs12) fcolor(gs12)) ///
-		(hist lwage_hour_hat if `filter', fcolor(none) lcolor(red)), xtitle (log gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(log, replace) title("`3'")
-		graph export "${dir_graphs}/log_`4'", replace 
-	
-	twoway (hist wage_hour if `filter' & wage_hour < 150, lcolor(gs12) fcolor(gs12)) ///
-		(hist wage_hour_hat if `filter' & wage_hour_hat < 150, fcolor(none) lcolor(red)), xtitle (gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(levels, replace) title("`3'") 
-		graph export "${dir_graphs}/level_`4'", replace
-
-end
 
 capture program drop outputResults
 program outputResults
@@ -122,12 +103,12 @@ program outputResults
 	
 	matrix results = r(table)
 	matrix results = results[1..6,1...]'   //extract the first six rows of results, and then transpose results
-	putexcel set "$dir_data/`outputFile'.xlsx", sheet("Estimates") replace 
+	putexcel set "$dir_raw_results/wages/`outputFile'.xlsx", sheet("Estimates") replace 
 	
 	putexcel A3 = matrix(results), names nformat(number_d2)
 	
 	matrix results = e(V)
-	putexcel set "$dir_data/`outputFile'.xlsx", sheet("Varcov") modify
+	putexcel set "$dir_raw_results/wages/`outputFile'.xlsx", sheet("Varcov") modify
 	putexcel A3 = matrix(results), names nformat(number_d2)
 		
 end
@@ -171,7 +152,7 @@ save "$work_dir/growth_rates", replace
 
 // Note: use code above if calculating real wage growth inside of the simulation, but if loading from excel use values from excel in Stata too. 
 //They *should* be the same but it is more consistent to have one source of values. 
-import excel "$dir_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real wage growth rates
+import excel "$dir_external_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real wage growth rates
 rename Year stm
 rename Value real_wage_growth
 replace stm = stm - 2000
@@ -179,7 +160,7 @@ sum real_wage_growth if stm == 15
 gen base = r(mean)
 replace real_wage_growth = real_wage_growth / base // Note: switching from 100 base to 1 base as that's what happens in the simulation when rebasing indices
 drop base
-save "$dir_data/growth_rates", replace
+save "$dir_external_data/growth_rates", replace
 
 
 /**************************************************************/
@@ -189,46 +170,32 @@ save "$dir_data/growth_rates", replace
 /**************************************************************/
 use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear
 
-drop if dag < $min_age
+do "$dir_do/variable_update"
+
+drop if dag < $min_age 
+
 * screen data to ensure that idperson and swv uniquely identify observations
 sort idperson swv
+duplicates report idperson swv
 gen chk = 0
 replace chk = 1 if (idperson == idperson[_n-1] & swv == swv[_n-1])
 drop if chk == 1
 
-* Fill in missing information on year (stm) based on wave (swv)
-/*
-replace stm = 2009 if swv == 1 & missing(stm)
-replace stm = 2010 if swv == 2 & missing(stm)
-replace stm = 2011 if swv == 3 & missing(stm)
-replace stm = 2012 if swv == 4 & missing(stm)
-replace stm = 2013 if swv == 5 & missing(stm)
-replace stm = 2014 if swv == 6 & missing(stm)
-replace stm = 2015 if swv == 7 & missing(stm)
-replace stm = 2016 if swv == 8 & missing(stm)
-replace stm = 2017 if swv == 9 & missing(stm)
-replace stm = 2018 if swv == 10 & missing(stm)
-replace stm = 2019 if swv == 11 & missing(stm)
-replace stm = 2020 if swv == 12 & missing(stm)
-replace stm = 2021 if swv == 13 & missing(stm)
-*/
-
-replace stm = stm - 2000
-
 
 /**************************************************************/
 *
 *	merge in real growth index from microsimulation's input folder
 *
 /**************************************************************/
-merge m:1 stm using "$dir_data/growth_rates", keep(3) nogen keepusing(real_wage_growth)
+merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen keepusing(real_wage_growth)
 
 //rename drgnl drgn1 // Rename region variable to drgn1 (one, not "l")
 
 *Variable stm identifies time periods. Need to ensure that combining idperson and stm ensures uniqueness.
+duplicates report idperson stm
 duplicates tag idperson stm, gen(dup)
-sort idperson stm
-//DP: no such cases //
+sort idperson stm 
+/*DP: no duplicates in terms of idperson and stm, therefore the code below is no longer needed 
 
 *However, this affects many variables: idhh, dag, ddt, dpd, ddt01, potentially idpartner. Might be best to move entire household. 
 *Furthermore, the duplicated observation can occur in a year for which y-1 and y+1 have been observed. 
@@ -250,19 +217,16 @@ gen count_year = stm - min_observed_year
 sort idperson stm swv // Sort interview date in ascending order - earliest interview will be the one with the gap_prev set to 1
 by idperson: gen gap_prev = (((count_year - count_year[_n-1]) > 1) & count_year>0) // There is a gap in year -1
 by idperson: replace gap_prev = 1 if _n == 1 & dup == 1 & stm > 2009
-//DP: 0 cases 
 
 gsort +idperson -stm -swv // Sort years in reverse order. Sort int date in descending order - later interview will be the one with gap_next set to 1
 by idperson: gen gap_next = (((count_year - count_year[_n-1]) < -1) & stm != 2018) // There is a gap in year +1
 sort idperson stm swv
 by idperson: replace gap_next = 1 if _n == _N & dup == 1 
-//DP: 1,547 real changes made
 by idperson: replace gap_prev = 0 if gap_next[_n-1] == 1 & dup[_n-1] == 1 // If previous observation already has flag set to move to next period, can't move another one to the same period
-//DP: 3,193 real changes made
+
 
 *Check if whole household is duplicated
 bys idhh swv: egen min_dup = min(dup) // If == 1, then every observation for that household is duplicated
-// 18480 cases 
 
 *Check if whole household can be moved either back or forward:
 bys idhh stm: egen hh_gap_prev = min(gap_prev)
@@ -270,12 +234,12 @@ bys idhh stm: egen hh_gap_next = min(gap_next)
 
 *Generate identifier for the whole household which should be moved: move the observation from the wave which is closer to the gap
 gen move = 1 if dup == 1 & (hh_gap_prev == 1 | hh_gap_next == 1) & min_dup == 1
-//DP: 6548 cases 
 
 *Move observations:
 replace stm = stm-1 if move == 1 & hh_gap_prev == 1 /*3,425 real changes made*/
 replace stm = stm+1 if move == 1 & hh_gap_next == 1 /*3,123 real changes made*/
 
+
 *Drop households with duplicated observations, keeping observations from more recent waves if duplicated years:
 sort stm idperson swv
 drop dup
@@ -284,13 +248,18 @@ by stm idperson: egen max_wave = max(swv) // Keep more recent obs
 gen drop_idhh = idhh if max_wave == swv & dup == 1 // This identifies idhh which should be dropped
 bys idhh stm: egen drop_idhh_max = max(drop_idhh) 
 drop if !missing(drop_idhh_max)
-//DP: 8,119 observations deleted
-duplicates drop idperson stm, force // Few duplicates left, drop
 
-
-****************************************
+duplicates drop idperson stm, force 
+*/
+ 
+/**************************************************************/
+*
+*	preliminaries
+*
+/**************************************************************/
 * Setting STATA to recognize Panel Data
 xtset idperson stm
+
 * total hours work per week (average)
 gen hours = 0
 replace hours = jbhrs if ((jbhrs > 0) & (jbhrs < .))
@@ -334,9 +303,13 @@ gen yplgrs_dv_level = sinh(yplgrs_dv)
 gen wage_hour = .
 replace wage_hour = yplgrs_dv_level / hours / 4.333 if (yplgrs_dv_level >= 50 & yplgrs_dv_level <= 83333 & hours >= 1 & hours <= 100)
 sum wage_hour, det
+fre wage_hour if wage_hour==0
+fre wage_hour if wage_hour==.
 *replace wage_hour = . if wage_hour < 4 | wage_hour > 70
+
 * relationship status (1=cohabitating)
 gen mar = (dcpst==1)
+
 * children
 gen any02 = dnc02 > 0
 gen dnc4p = dnc
@@ -344,17 +317,14 @@ replace dnc4p = 1 if (dnc>4)
 gen dnc2p = dnc
 replace dnc2p = 2 if (dnc>2)
 cap gen child = (dnc>0)
-* individual weights
-by idperson: egen wgt = mean(dimlwt)
-* 
 
-
-/**************************************************************/
-*
-*	preliminaries
-*
-/**************************************************************/
+* individual weights
+//by idperson: egen wgt = mean(dimlwt)
+by idperson: egen wgt = mean(dimxwt)
+ 
+* ln wages 
 gen lwage_hour = ln(wage_hour)
+
 hist lwage_hour if lwage_hour > 0 & lwage_hour < 4.4
 
 gen swage_hour = asinh(wage_hour)
@@ -362,20 +332,77 @@ hist swage_hour if (swage_hour > 1 & swage_hour < 5)
 
 replace lwage_hour = . if (wage_hour<5 | wage_hour>1000)
 
+gen lwage_hour_2 = lwage_hour^2
+
+*correct employment status 
 replace les_c3 = 3 if lwage_hour == . & les_c3 ! = 2 // PB: employment status is set on the basis of hourly wage not missing, so recode labour market activity status to match this for non-students
 replace les_c3 = 1 if lwage_hour != . // PB: as above, if wage present consider as employed
 
 recode deh_c3 dehm_c3 dehf_c3 drgn1 dhe (-9=.)
 
+gen L1les_c3 = L1.les_c3
+
+*part time work 
+gen pt = (hours >  0) * (hours <= 25)
+drop hrs0_m1 hrs1_m1
+
+
+
+*****************************************************************************************************************************
+* Set up the Excel workbooks that receive the formatted estimates
+* Info sheet - first stage: starts reg_employmentSelection afresh (replace) and documents the sheets it will hold
+putexcel set "$dir_results/reg_employmentSelection", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "This file contains regression estimates from the first stage of the Heckman selection model used to estimate wages."
+putexcel A2 = "Authors:	Patryk Bronka, Justin Van de Ven, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+* Index of the estimate sheets: one per sex and per previous-year wage status (NE = no observed wage, E = observed wage)
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "EmploymentSelection_FemaleNE"
+putexcel B5 = "First stage Heckman selection estimates for women that do not have an observed wage in the previous year"
+putexcel A6 = "EmploymentSelection_MaleNE"
+putexcel B6 = "First stage Heckman selection estimates for men that do not have an observed wage in the previous year"
+putexcel A7 = "EmploymentSelection_FemaleE"
+putexcel B7 = "First stage Heckman selection estimates for women that have an observed wage in the previous year"
+putexcel A8 = "EmploymentSelection_MaleE"
+putexcel B8 = "First stage Heckman selection estimates for men that have an observed wage in the previous year"
+* Free-text notes shown below the sheet index
+putexcel A11 = "Notes:", bold
+putexcel B11 = "Estimated on panel data unlike the labour supply estimates"
+putexcel B12 = "Predicted wages used as input into union parameters and income process estimates"
+putexcel B13 = "Two-step Heckman command is used which does not permit weights"
+
+* Info sheet - second stage 
+putexcel set "$dir_results/reg_wages", sheet("Info") replace
+putexcel A1 = "Description:"
+putexcel B1 = "This file contains regression estimates used to calculate potential wages for males and females in the simulation."
+putexcel A2 = "Authors:	Patryk Bronka, Daria Popova" 
+putexcel A3 = "Last edit: 1 July 2025 DP"
+
+putexcel A4 = "Process:", bold
+putexcel B4 = "Description:", bold
+putexcel A5 = "Wages_FemalesNE"
+putexcel B5 = "Heckman selection estimates using women that do not have an observed wage in the previous year"
+putexcel A6 = "Wages_MalesNE"
+putexcel B6 = "Heckman selection estimates using men that do not have an observed wage in the previous year"
+putexcel A7 = "Wages_FemalesE"
+putexcel B7 = "Heckman selection estimates using women that have an observed wage in the previous year"
+putexcel A8 = "Wages_MalesE"
+putexcel B8 = "Heckman selection estimates using men that have an observed wage in the previous year"
+
+putexcel A11 = "Notes:", bold
+putexcel B11 = "Estimated on panel data unlike the labour supply estimates"
+putexcel B12 = "Predicted wages used as input into union parameters and income process estimates"
+putexcel B13 = "Two-step Heckman command is used which does not permit weights"
+putexcel B14 = "Regions: London is the reference region" 
+
 
 /**************************************************************/
 *
-*	pooled cross-sectional regressions
+*	Regressions
 *
 /**************************************************************/
-gen pt = (hours >  0) * (hours <= 25)
-drop hrs0_m1 hrs1_m1
-
 * Strategy: 
 * 1) Heckman estimated on the sub-sample of individuals who were not observed working in previous period. 
 *    Wage equation does not controls for lagged wage
@@ -384,103 +411,880 @@ drop hrs0_m1 hrs1_m1
 * Specification of selection equation is the same in the two samples
 
 * Flag to identify observations to be included in the estimation sample 
+/* The sample should include only individuals who are observed for at least two periods, and then the first observation should not be used in the estimation. */
 bys idperson: gen obs_count = _N
-gen in_sample = (obs_count > 1 & swv > 1)
+gen in_sample = (obs_count > 1 & swv > 1) 
 
 * Flag to distinguish the two samples
 capture drop previouslyWorking
-gen previouslyWorking = (L1.lwage_hour != .) /* PB 07.02.2023: I think this will set previosuly working to 0 for everyone 
-who is not observed in the previous period, e.g. all observations at Wave 1. I think the sample should include only individuals 
-who are observed for at least two periods, and then the first observation should not be used in the estimation. */
+gen previouslyWorking = (L1.lwage_hour != .) 
+fre previouslyWorking
 
+* Prep storage 
 capture drop lwage_hour_hat wage_hour_hat esample
 gen lwage_hour_hat = .
 gen wage_hour_hat = .
 gen esample = .
-
-gen L1les_c3 = L1.les_c3
-gen lwage_hour_2 = lwage_hour^2
-
 gen pred_hourly_wage = .
 
 *** 1) Heckman estimated on the sub-sample of individuals who were not observed working in previous period. 
 ****   Wage equation does not control for lagged wage
-
+**************************************************************************************************************************
 * women
-global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1  pt real_wage_growth"
-global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1 " 
+**************************************************************************************************************************
+global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs  ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe
+global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe
 local filter = "dgn==0 & dag>=$min_age & dag<=$max_age & !previouslyWorking"
 *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust)
-heckman $wage_eqn if `filter', select($seln_eqn) twostep
+heckman $wage_eqn if `filter', select($seln_eqn) twostep 
 outputResults "Not-working women3"
 
-outreg2 stats(coef se pval) using "$dir_data/Output_NWW.doc", replace ///
+outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_NWW.doc", replace ///
 title("Heckman-corrected wage equation estimated on the sample of women who were not in employment last year") ///
- ctitle(In education) label side dec(2) noparen 
-
+ ctitle(Not working women) label side dec(2) noparen 
+ 
+ 
 *xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2)
 computePredicted "heckman" `filter'
-analyseFit "e(sample)" "nocorr"
-replace esample = 1 if e(sample)
-replace pred_hourly_wage = wage_hour_hat if e(sample)
+analyseFit "e(sample)" "nocorr" "Not working women, 17-64 years" "NWW"
+gen in_sample_fnpw = e(sample)
+replace pred_hourly_wage = wage_hour_hat if in_sample_fnpw
 
+* Save sample for later use (internal validation)
+save "$dir_validation_data/Female_NPW_sample", replace 
 
+* Formatted results
+* Clean up matrix of estimates 
+* Note: Zero values are eliminated 
+matrix b = e(b)	
+matrix V = e(V)
 
+* Store variance-covariance matrix: V is round-tripped through Excel so that zero rows and columns can be dropped with dataset commands
+preserve
+* Dump the full matrix to a scratch workbook and read it back as a dataset (the estimation data are brought back by restore below)
+putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear
+* Number of imported columns; this local is reused after restore to loop over the coefficient vector b
+describe
+local no_vars = `r(k)'	
+* Two passes of drop-and-transpose: pass one removes rows whose total is zero, pass two does the same for the columns and transposes back
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+* After xpose the variables are named v1, v2, ... so v* collects the whole cleaned matrix
+mkmat v*, matrix(var)	
+
+* Second stage: write the cleaned matrix to the raw workbook starting at cell C2. NOTE(review): replace overwrites the whole raw workbook, not only this sheet - confirm intended
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") replace
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients: copy the non-zero entries of b into a compact vector and write it next to the variance-covariance matrix
+* Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element of b to count non-zero coefficients (no_vars is the column count recorded in the variance-covariance step)
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector, sized by that count, to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b, keeping their original order (entries equal to exactly zero are skipped - presumably omitted or base categories, TODO confirm)
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+* Write the coefficients as a column anchored at A1 of the raw sheet; modify keeps the matrix already written from C2
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") modify
+putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) 
+
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") firstrow ///
+	clear
+ds 
+
+drop if C == 0 // UPDATE 
+drop A 
+drop AH-BM // UPDATE
+
+
+
+mkmat *, matrix(Females_NLW)
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_FemalesNE") modify 
+putexcel B2 = matrix(Females_NLW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_FemalesNE") modify 
+
+local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other  Constant InverseMillsRatio
+
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+* Column A, rows 2 onwards: one regressor label per row, in var_list order
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+* Row 1, columns C onwards: the same labels as column headers (i is 3 on first use, i.e. column C)
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+
+* First stage
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") firstrow ///
+	clear
+ds 
+
+drop if AN == 0 // UPDATE
+drop A 
+drop C-AG // UPDATE
+drop BN // UPDATE
+
+
+mkmat *, matrix(Females_NLW)
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_FemaleNE") modify 
+putexcel B2 = matrix(Females_NLW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_FemaleNE") modify 
+	
+local var_list Les_c3_NotEmployed_L1 Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other  Constant 
+	
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+cap drop lambda
+
+
+* Calculate RMSE 
+cap drop residuals squared_residuals  
+gen residuals = lwage_hour - lwage_hour_hat
+gen squared_residuals = residuals^2
+
+preserve 
+keep if `filter'
+sum squared_residuals 
+di "RMSE for Not employed women:  " sqrt(r(mean))
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A1=("REGRESSOR") B1=("COEFFICIENT") ///
+A2=("Wages_FemalesNE") B2=(sqrt(r(mean))) 
+restore 
+
+
+****************************************************************************************************************************
 * men
-global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1  pt real_wage_growth"
-global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1 " 
+****************************************************************************************************************************
+global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs  ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe
+global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe
 local filter = "dgn==1 & dag>=$min_age & dag<=$max_age & !previouslyWorking"
 *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust)
-heckman $wage_eqn if `filter', select($seln_eqn) twostep
+heckman $wage_eqn if `filter', select($seln_eqn) twostep 
 outputResults "Not-working men3"
 
-outreg2 stats(coef se pval) using "$dir_data/Output_NWM.doc", replace ///
-title("Heckman-corrected wage equation estimated on the sample of men who were not in employment in the previous year") ///
-ctitle(Wage equation coef.) label side dec(2) noparen 
-
+outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_NWM.doc", replace ///
+title("Heckman-corrected wage equation estimated on the sample of men who were not in employment last year") ///
+ ctitle(Not working men) label side dec(2) noparen 
+ 
+ 
+*xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2)
 computePredicted "heckman" `filter'
-analyseFit "e(sample)" "nocorr"
-replace esample = 1 if e(sample)
-replace pred_hourly_wage = wage_hour_hat if e(sample)
+analyseFit "e(sample)" "nocorr" "Not working men, 17-64 years" "NWM"
+gen in_sample_mnpw = e(sample)
+replace pred_hourly_wage = wage_hour_hat if in_sample_mnpw
+
+* Save sample for later use (internal validation)
+save "$dir_validation_data/Male_NPW_sample", replace 
+
+* Formatted results
+* Clean up matrix of estimates 
+* Note: Zero values are eliminated 
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+
+* Second stage
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") replace
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients 
+* Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") modify
+putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) 
+
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") firstrow ///
+	clear
+ds 
+
+drop if C == 0 // UPDATE 
+drop A 
+drop AH-BM // UPDATE
+
+
+
+mkmat *, matrix(Males_NLW)
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_MalesNE") modify 
+putexcel B2 = matrix(Males_NLW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_MalesNE") modify 
+
+local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other  Constant InverseMillsRatio
+
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+
+* First stage
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") firstrow ///
+	clear
+ds 
+
+drop if AN == 0 // UPDATE
+drop A 
+drop C-AG // UPDATE
+drop BN // UPDATE
+
+
+mkmat *, matrix(Males_NLW)
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_MaleNE") modify 
+putexcel B2 = matrix(Males_NLW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_MaleNE") modify 
+	
+local var_list Les_c3_NotEmployed_L1 Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other Constant 
+	
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+cap drop lambda
+
+* Calculate RMSE 
+cap drop residuals squared_residuals  
+gen residuals = lwage_hour - lwage_hour_hat
+gen squared_residuals = residuals^2
+
+preserve 
+keep if `filter'
+sum squared_residuals 
+di "RMSE for Not employed men:  " sqrt(r(mean))
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A1=("REGRESSOR") B1=("COEFFICIENT") ///
+A3=("Wages_MalesNE") B3=(sqrt(r(mean))) 
+restore 
+
 
 *** 2) Heckman estimated on the sub-sample of individuals who were observed working in previous period. 
 ***    Wage equation controls for lagged wage
-
+***************************************************************************************************************************************
 * women
-global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1 pt real_wage_growth"
-global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1 " 
-local filter = "dgn==0 & dag>=$min_age & dag<=$max_age & swv > 1 & previouslyWorking"
+***************************************************************************************************************************************
+global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs  ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe
+global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe
+local filter = "dgn==0 & dag>=$min_age & dag<=$max_age & previouslyWorking"
 *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust)
-heckman $wage_eqn if `filter', select($seln_eqn) twostep
+heckman $wage_eqn if `filter', select($seln_eqn) twostep 
 outputResults "Working women3"
 
-outreg2 stats(coef se pval) using "$dir_data/Output_WW.doc", replace ///
-title("Heckman-corrected wage equation estimated on the sample of women who were in employment in the previous year") ///
- ctitle(Wage equation coef.) label side dec(2) noparen 
-
+outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_WW.doc", replace ///
+title("Heckman-corrected wage equation estimated on the sample of women who were in employment last year") ///
+ ctitle(Working women) label side dec(2) noparen 
+ 
+ 
+*xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2)
 computePredicted "heckman" `filter'
-analyseFit "e(sample)" 
-replace esample = 1 if e(sample)
-replace pred_hourly_wage = wage_hour_hat if e(sample)
+analyseFit "e(sample)" "nocorr" "Working women, 17-64 years" "WW"
+gen in_sample_fpw = e(sample)
+replace pred_hourly_wage = wage_hour_hat if in_sample_fpw
+
+* Save sample for later use (internal validation)
+save "$dir_validation_data/Female_PW_sample", replace 
+
+* Formatted results
+* Clean up matrix of estimates 
+* Note: Zero values are eliminated 
+matrix b = e(b)	
+matrix V = e(V)
+
+* Store variance-covariance matrix 
+preserve
 
+putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+
+* Second stage
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_LW") replace
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients 
+* Initialize a counter for non-zero coefficients
+local non_zero_count = 0
+//local names : colnames b
+
+* Loop through each element in `b` to count non-zero coefficients
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        local non_zero_count = `non_zero_count' + 1
+    }
+}
+
+* Create a new row vector to hold only non-zero coefficients
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Populate nonzero_b with non-zero coefficients from b
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_LW") modify
+putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) 
+
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Females_LW") firstrow ///
+	clear
+ds 
+
+drop if C == 0 // UPDATE 
+drop A 
+drop AI-BM // UPDATE
+
+
+mkmat *, matrix(Females_LW)
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_FemalesE") modify 
+putexcel B2 = matrix(Females_LW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_FemalesE") modify 
+
+local var_list L1_log_hourly_wage Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other  Constant InverseMillsRatio
+
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+
+* First stage
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Females_LW") firstrow ///
+	clear
+ds 
+
+drop if AO == 0 // UPDATE
+drop A 
+drop C-AH // UPDATE
+drop BN // UPDATE
+
+
+mkmat *, matrix(Females_LW)
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_FemaleE") modify 
+putexcel B2 = matrix(Females_LW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_FemaleE") modify 
+	
+local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other  Constant 
+	
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+cap drop lambda
+
+
+* Calculate RMSE 
+cap drop residuals squared_residuals  
+gen residuals = lwage_hour - lwage_hour_hat
+gen squared_residuals = residuals^2
+
+preserve 
+keep if `filter'
+sum squared_residuals 
+di "RMSE for Employed women:  " sqrt(r(mean))
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A1=("REGRESSOR") B1=("COEFFICIENT") ///
+A4=("Wages_FemalesE") B4=(sqrt(r(mean))) 
+restore 
+
+
+****************************************************************************************************************************************
 * men
-global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1 pt real_wage_growth"
-global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1" 
-local filter = "dgn==1 & dag>=$min_age & dag<=$max_age & swv > 1 & previouslyWorking"
+****************************************************************************************************************************************
+global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs  ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe
+global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe
+local filter = "dgn==1 & dag>=$min_age & dag<=$max_age & previouslyWorking"
 *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust)
-heckman $wage_eqn if `filter', select($seln_eqn) twostep
+heckman $wage_eqn if `filter', select($seln_eqn) twostep 
 outputResults "Working men3"
 
-outreg2 stats(coef se pval) using "$dir_data/Output_WM.doc", replace ///
-title("Heckman-corrected wage equation estimated on the sample of men who were in employment in the previous year") ///
- ctitle(Wage equation coef.) label side dec(2) noparen 
+outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_WM.doc", replace /// export the Heckman estimates for men as a Word table
+title("Heckman-corrected wage equation estimated on the sample of men who were in employment last year") ///
+ ctitle(Working men) label side dec(2) noparen // column title now matches the male sample (it previously read Working women)
+ 
+ 
+*xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2)
+computePredicted "heckman" `filter'
+analyseFit "e(sample)" "nocorr" "Working men, 17-64 years" "WM"
+gen in_sample_mpw = e(sample)
+replace pred_hourly_wage = wage_hour_hat if in_sample_mpw
+
+* Save sample for later use (internal validation)
+save "$dir_validation_data/Male_PW_sample", replace 
 
+* Formatted results
+* Clean up matrix of estimates 
+* Note: Zero values are eliminated 
+matrix b = e(b)	
+matrix V = e(V)
 
-computePredicted "heckman" `filter'
-analyseFit "e(sample)"
-replace esample = 1 if e(sample)
-replace pred_hourly_wage = wage_hour_hat if e(sample)
+* Store variance-covariance matrix 
+preserve
+
+putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace
+putexcel A1 = matrix(V)
+
+import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear
+
+describe
+local no_vars = `r(k)'	
+	
+forvalues i = 1/2 {
+	egen row_sum = rowtotal(*)
+	drop if row_sum == 0 
+	drop row_sum
+	xpose, clear	
+}	
+	
+mkmat v*, matrix(var)	
+
+* Second stage
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_LW") replace
+putexcel C2 = matrix(var)
+		
+restore	
+
+* Store estimated coefficients
+* Pass 1: tally the entries of b that are not exactly zero
+local non_zero_count = 0
+//local names : colnames b
+
+* no_vars is the variable count taken from the imported var_cov sheet
+forvalues col = 1/`no_vars' {
+    if (b[1, `col'] != 0) {
+        local ++non_zero_count
+    }
+}
+
+* Row vector sized to the tally, pre-filled with missing values
+matrix nonzero_b = J(1, `non_zero_count', .)
+
+* Pass 2: copy the non-zero entries of b across in their original order
+local index = 1
+forvalues col = 1/`no_vars' {
+    if (b[1, `col'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `col']
+        local ++index
+    }
+}
+
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_LW") modify
+putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) 
+
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Males_LW") firstrow ///
+	clear
+ds 
+
+drop if C == 0 // UPDATE 
+drop A 
+drop AI-BM // UPDATE
+
+
+mkmat *, matrix(Males_LW)
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_MalesE") modify 
+putexcel B2 = matrix(Males_LW)
+
+restore 
 
+* Labelling 
+putexcel set "$dir_results/reg_wages", ///
+	sheet("Wages_MalesE") modify 
+
+local var_list L1_log_hourly_wage Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other  Constant InverseMillsRatio
+
+	
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+
+* First stage
+preserve
+
+import excel "$dir_raw_results/wages/reg_wages", sheet("Males_LW") firstrow ///
+	clear
+ds 
+
+drop if AO == 0 // UPDATE
+drop A 
+drop C-AH // UPDATE
+drop BN // UPDATE
+
+
+mkmat *, matrix(Males_LW)
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_MaleE") modify 
+putexcel B2 = matrix(Males_LW)
+
+restore 
+
+* Labelling 
+putexcel set "$dir_results/reg_employmentSelection", ///
+	sheet("EmploymentSelection_MaleE") modify 
+	
+local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag ///
+	Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs  ///
+	UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 ///
+	Ethn_Asian Ethn_Black Ethn_Other Constant 
+	
+
+putexcel A1 = "REGRESSOR"
+putexcel B1 = "COEFFICIENT"
+
+local i = 1 	
+foreach var in `var_list' {
+	local ++i
+	
+	putexcel A`i' = "`var'"
+	
+} 	
+
+local i = 2 	
+foreach var in `var_list' {
+    local ++i
+
+    if `i' <= 26 {
+        local letter = char(64 + `i')  // Convert 1=A, 2=B, ..., 26=Z
+        putexcel `letter'1 = "`var'"
+    }
+    else {
+        local first = char(64 + int((`i' - 1) / 26))  // First letter: A-Z
+        local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z
+        putexcel `first'`second'1 = "`var'"  // Correctly places AA-ZZ
+    }
+}
+
+cap drop lambda
+
+
+* Calculate RMSE 
+cap drop residuals squared_residuals  
+gen residuals = lwage_hour - lwage_hour_hat
+gen squared_residuals = residuals^2
+
+preserve 
+keep if `filter'
+sum squared_residuals 
+di "RMSE for Employed men:  " sqrt(r(mean))
+putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify
+putexcel A1=("REGRESSOR") B1=("COEFFICIENT") ///
+A5=("Wages_MalesE") B5=(sqrt(r(mean))) 
+restore 
+
+
+sum wage_hour if wage_hour >0& stm==19, d
+sum pred_hourly_wage if pred_hourly_wage >0& stm==19, d
+
+/*
+******************************************************************************************************************************************
 * all
 analyseFit "esample == 1"
 analyseFit "esample == 1 & dgn == 0"	// women
@@ -496,15 +1300,14 @@ forvalues year = 11/23 {
 	analyseFit2 "esample == 1 & dgn == 1 & deh_c3 == 1 & stm == `year'" "nocorr" "Year 20`year' men prv emp high ed" "men_highed_`year'_graph.png"	// men
 }
 
-
+*/
 
 // Note: sigma reported in the estimated regressions is the standard deviation of the residuals (=RMSE, assuming residuals are normally distributed)
 
 *** Save for use in the do file estimating non-employment income
 replace pred_hourly_wage = exp(lwage_hour) if missing(pred_hourly_wage)
 
-save "$dir_ukhls_data/ukhls_pooled_all_obs.dta", replace
-
+save "$dir_ukhls_data/ukhls_pooled_all_obs_10.dta", replace
 
 *** Calculate the proportion of "true zero" hours of work among those in the "ZERO" weekly hours of labour supply bracket. 
 *I.e. the share of zero hours among 0-5 hours for those at risk of work. 
diff --git a/input/InitialPopulations/compile/RegressionEstimates/variable_update.do b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do
new file mode 100644
index 000000000..bfaa412f1
--- /dev/null
+++ b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do
@@ -0,0 +1,321 @@
+
+xtset idperson swv
+
+* --------------------------------------------
+* 1. Handle Missing Values and Basic Setup
+* --------------------------------------------
+
+// Recode -9 as missing for all numeric variables. mvdecode skips string
+// variables, whereas looping replace over _all stops with a type mismatch
+// on the first string variable it meets.
+mvdecode _all, mv(-9)
+
+// Sort data by individual and wave
+sort idperson swv
+
+// Recode year to two-digit format
+replace stm = stm - 2000
+
+// Generate COVID year dummies (stm is two-digit after the recode above); capture leaves an existing copy untouched
+cap gen y2020 = (stm == 20)
+cap gen y2021 = (stm == 21)
+
+
+* --------------------------------------------
+* 2. Correct Inconsistencies
+* --------------------------------------------
+
+// Fix inconsistent student coding
+replace ded = 0 if idperson == idperson[_n-1] & ded == 1 & ded[_n-1] == 0
+
+
+* --------------------------------------------
+* 3. Construct New Variables
+* --------------------------------------------
+
+// Partnership status in the first year: 0 where dcpst == 1, set to 1 where dcpen == 1
+cap gen new_rel = 0 if dcpst == 1
+replace new_rel = 1 if dcpen == 1
+label var new_rel "Partnership in first year"
+
+// Household type: 8 categories
+cap cap gen dhhtp_c8 = . 
+label var dhhtp_c8 "Household Type: 8 Category"
+replace dhhtp_c8 = 1 if dhhtp_c4 == 1 & lessp_c3 == 1
+replace dhhtp_c8 = 2 if dhhtp_c4 == 1 & lessp_c3 == 2
+replace dhhtp_c8 = 3 if dhhtp_c4 == 1 & lessp_c3 == 3	
+replace dhhtp_c8 = 4 if dhhtp_c4 == 2 & lessp_c3 == 1
+replace dhhtp_c8 = 5 if dhhtp_c4 == 2 & lessp_c3 == 2
+replace dhhtp_c8 = 6 if dhhtp_c4 == 2 & lessp_c3 == 3	
+replace dhhtp_c8 = 7 if dhhtp_c4 == 3
+replace dhhtp_c8 = 8 if dhhtp_c4 == 4
+cap label define dhhtp_c8 1 "Couple with no children, spouse employed" 2 "Couple with no children, spouse student" 3 "Couple with no children, spouse not employed" 4 "Couple with children, spouse employed" 5 "Couple with children, spouse student" 6 "Couple with children, spouse not employed" 7 "Single with no children" 8 "Single with children"
+label values dhhtp_c8 dhhtp_c8	
+
+// Square of personal non-benefit gross income
+capture generate ypnbihs_dv_sq = ypnbihs_dv^2
+label var ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared"
+
+// Indicator: strictly positive, non-missing capital income
+capture generate receives_ypncp = (ypncp < . & ypncp > 0)
+
+// Capital income: convert from IHS to level with sinh(), then take the log of the level
+capture drop ypncp_lvl
+capture generate ypncp_lvl = sinh(ypncp)
+capture generate ln_ypncp = ln(ypncp_lvl)
+
+// Private pension income: level via sinh(), its log, and a receipt indicator
+capture drop ypnoab_lvl
+capture generate ypnoab_lvl = sinh(ypnoab)
+capture generate ln_ypnoab = ln(ypnoab_lvl)
+capture generate receives_ypnoab = (ypnoab_lvl < . & ypnoab_lvl > 0)
+
+// Indicator for state pension age, coded as dag >= 68
+capture generate state_pension_age = (dag >= 68)
+
+
+* --------------------------------------------------
+* 4. Lag Variables + Handle Missing Lags at Age 16
+* --------------------------------------------------
+
+// Create basic lags
+// Each l_<var> takes the previous row's value and stays missing unless that
+// row belongs to the same person and is exactly one wave earlier.
+sort idperson swv
+local lag_vars ydses_c5 dhe les_c3 lesnr_c2 dhhtp_c4 ///
+	dhe_pcs dhe_mcs dlltsd dlltsd01
+foreach v of local lag_vars {
+	// capture: an existing lag variable, or an absent source variable, is
+	// passed over silently instead of stopping the script
+	cap gen l_`v' = `v'[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1
+}
+
+// Fill in missing dhe, dhe_pcs and dhe_mcs at ages <= 16. Rows are ordered latest year first within person before carryforward runs - presumably so later values are copied back onto earlier missing ones (carryforward is not defined in this file; TODO confirm)
+gsort +idperson -stm
+bys idperson: carryforward dhe if dag <= 16, replace 
+bys idperson: carryforward dhe_pcs if dag <= 16, replace 
+bys idperson: carryforward dhe_mcs if dag <= 16, replace 
+
+sort idperson swv
+cap drop dhe_L1
+bys idperson: gen dhe_L1 = l.dhe
+replace dhe_L1 = dhe if missing(dhe_L1)
+
+cap drop dhe_pcs_L1
+bys idperson: gen dhe_pcs_L1 = l.dhe_pcs
+replace dhe_pcs_L1 = dhe_pcs if missing(dhe_pcs_L1)
+
+cap drop dhe_mcs_L1
+bys idperson: gen dhe_mcs_L1 = l.dhe_mcs
+replace dhe_mcs_L1 = dhe_mcs if missing(dhe_mcs_L1) // fall back to the current dhe_mcs (previously filled from dhe, the self-rated health variable)
+
+cap drop yplgrs_dv_L1
+bys idperson: gen yplgrs_dv_L1 = l.yplgrs_dv
+replace yplgrs_dv_L1 = yplgrs_dv if missing(yplgrs_dv_L1)
+
+cap drop yplgrs_dv_L2
+bys idperson: gen yplgrs_dv_L2 = l2.yplgrs_dv
+replace yplgrs_dv_L2 = yplgrs_dv if missing(yplgrs_dv_L2)
+
+cap drop ypncp_L1
+bys idperson: gen ypncp_L1 = l.ypncp
+replace ypncp_L1 = ypncp if missing(ypncp_L1)
+
+cap drop ypncp_L2
+bys idperson: gen ypncp_L2 = l2.ypncp
+replace ypncp_L2 = ypncp if missing(ypncp_L2)
+
+cap drop ypnoab_L1
+bys idperson: gen ypnoab_L1 = l.ypnoab
+replace ypnoab_L1 = ypnoab if missing(ypnoab_L1)
+
+cap drop ypnoab_L2
+bys idperson: gen ypnoab_L2 = l2.ypnoab
+replace ypnoab_L2 = ypnoab if missing(ypnoab_L2)
+
+cap drop dhhtp_c4_L1
+bys idperson: gen dhhtp_c4_L1 = l.dhhtp_c4
+replace dhhtp_c4_L1 = dhhtp_c4 if missing(dhhtp_c4_L1)
+
+cap drop les_c3_L1
+bys idperson: gen les_c3_L1 = l.les_c3
+replace les_c3_L1 = les_c3 if missing(les_c3_L1)
+
+
+* --------------------------------------------------
+* 5. Labelling 
+* --------------------------------------------------
+
+* Label definitions
+cap label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
+cap label define jbg 1 "Employed" 2 "Student" 3 "Not employed" 4 "Retired"
+cap label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
+cap label define hht 1 "Couples with No Children" 2 "Couples with Children" 3 "Single with No Children" 4 "Single with Children"
+cap label define gdr 1 "Male" 0 "Female"
+cap label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" 6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" 12 "Scotland" 13 "Northern Ireland"
+cap label define yn 1 "Yes" 0 "No"
+cap label define dces 1 "Both Employed" 2 "Employed, Spouse Not Employed" 3 "Not Employed, Spouse Employed" 4 "Both Not Employed"
+cap label define ethn 1 "White" 2 "Asian or Asian British" 3 "Black, Black British, Caribbean, or African" 4 "Other or missing ethnic group"
+cap label define dhe 1 "Poor" 2 "Fair" 3 "Good" 4 "Very Good" 5 "Excellent", modify 
+
+* Variable labels
+label variable dgn "Gender" // label previously read "cap gender", apparently a stray search-and-replace artefact
+label variable dag "Age"
+label variable dagsq "Age Squared"
+label variable drgn1 "Region"
+label variable stm "Year"
+label variable les_c3 "Employment Status: 3 Category"
+label variable les_c4 "Employment Status: 4 Category"
+label variable dhe "Self-rated Health"
+label variable dcpen "Entered a new Partnership"
+label variable dcpex "Partnership dissolution"
+label variable deh_c3 "Educational Attainment: 3 Category"
+label variable ydses_c5 "Annual Household Income Quintile"
+label variable dlltsd "Long-term Sick or Disabled"
+label variable dhhtp_c4 "Household Type: 4 Category"
+label variable dhhtp_c8 "Household Type: 8 Category"
+label variable dnc "Number of Children in Household"
+label variable dnc02 "Number of Children aged 0-2 in Household"
+label variable dot "Ethnicity"
+label variable dehmf_c3 "Highest Parental Educational Attainment: 3 Category"
+label variable dhe_mcs "Subjective Self-rated health - Mental (SF12 MCS)"
+label variable dhe_pcs "Subjective Self-rated health - Physical (SF12 PCS)"
+label variable dagpns "Reached state retirement age"
+label variable dagpns_sp "Reached state retirement age - partner"
+label variable dukfr "UK Fertility Rate"
+label variable lesdf_c4 "Differential Employment Status"
+label variable ypnbihs_dv "Personal Non-benefit Gross Income"
+label variable ynbcpdf_dv "Differential Personal Non-Benefit Gross Income"
+
+* Attach value labels to variables
+label values dgn gdr
+label values drgn1 rgna
+label values les_c3 lessp_c3 jbf 
+label values les_c4 jbg 
+label values deh_c3 dehsp_c3 edd 
+label values dcpen dcpex yn
+label values lesdf_c4 dces
+label values dhhtp_c4 hht 
+label values dhhtp_c8 dhhtp_c8
+label values dot ethn 
+label values dhe dhe
+label value ded yn
+label value dlltsd yn
+label value dlltsd01 yn
+
+* Alter names and create dummies for automatic labelling 
+*(required for gologit) 
+
+cap gen Dgn = dgn 
+cap gen Dag = dag  
+cap gen Dag_sq = dagsq 
+
+
+capture drop UK*
+capture drop Deh_c3_*
+capture drop Dehmf_c3_*
+capture drop Les_c4_*
+capture drop L_Les_c3_*
+capture drop Ydses_c5_Q*
+capture drop L_Ydses_c5_Q*
+capture drop Dhe_*
+capture drop L_Dhe_c5_*
+capture drop Dhhtp_c4_*
+capture drop L_Dhhtp_c4_*
+capture drop dot_*
+cap drop Ethn_White Ethn_Asian Ethn_Black Ethn_Other
+
+tab drgn1, gen(UK) 
+rename UK1 UKC //North East
+rename UK2 UKD //North West
+rename UK3 UKE //Yorkshire and the Humber
+rename UK4 UKF //East Midlands
+rename UK5 UKG //West Midlands
+rename UK6 UKH //East of England
+rename UK7 UKI //London
+rename UK8 UKJ //South East
+rename UK9 UKK //South West
+rename UK10 UKL //Wales
+rename UK11 UKM //Scotland
+rename UK12 UKN //Northern Ireland
+
+tab deh_c3, gen(Deh_c3_)
+rename Deh_c3_1 Deh_c3_High
+rename Deh_c3_2 Deh_c3_Medium
+rename Deh_c3_3 Deh_c3_Low
+
+tab dehmf_c3, gen(Dehmf_c3_)
+rename Dehmf_c3_1 Dehmf_c3_High
+rename Dehmf_c3_2 Dehmf_c3_Medium
+rename Dehmf_c3_3 Dehmf_c3_Low
+
+tab les_c4, gen(Les_c4_)
+rename Les_c4_1 Les_c4_Employed
+rename Les_c4_2 Les_c4_Student
+rename Les_c4_3 Les_c4_NotEmployed
+rename Les_c4_4 Les_c4_Retired
+
+tab l_les_c3, gen(L_Les_c3_)
+rename L_Les_c3_1 L_Les_c3_Employed
+rename L_Les_c3_2 L_Les_c3_Student
+rename L_Les_c3_3 L_Les_c3_NotEmployed
+
+tab ydses_c5, gen(Ydses_c5_Q)
+
+tab l_ydses_c5, gen(L_Ydses_c5_Q)
+
+tab dhe, gen(Dhe_)
+rename Dhe_1 Dhe_Poor
+rename Dhe_2 Dhe_Fair
+rename Dhe_3 Dhe_Good
+rename Dhe_4 Dhe_VeryGood
+rename Dhe_5 Dhe_Excellent
+
+tab l_dhe, gen(L_Dhe_c5_)
+
+tab dhhtp_c4, gen(Dhhtp_c4_)
+rename Dhhtp_c4_1 Dhhtp_c4_CoupleNoChildren
+rename Dhhtp_c4_2 Dhhtp_c4_CoupleChildren
+rename Dhhtp_c4_3 Dhhtp_c4_SingleNoChildren
+rename Dhhtp_c4_4 Dhhtp_c4_SingleChildren
+
+tab l_dhhtp_c4, gen(L_Dhhtp_c4_)
+rename L_Dhhtp_c4_1 L_Dhhtp_c4_CoupleNoChildren
+rename L_Dhhtp_c4_2 L_Dhhtp_c4_CoupleChildren
+rename L_Dhhtp_c4_3 L_Dhhtp_c4_SingleNoChildren
+rename L_Dhhtp_c4_4 L_Dhhtp_c4_SingleChildren
+
+tab dot, gen(dot_)
+rename dot_1 Ethn_White
+rename dot_2 Ethn_Asian
+rename dot_3 Ethn_Black
+rename dot_4 Ethn_Other
+
+
+
+
+cap gen Year_transformed = stm  
+
+cap gen Y2020 = y2020
+cap gen Y2021 = y2021
+
+cap gen Dhe = dhe 
+cap gen Dhe_pcs = dhe_pcs
+cap gen Dhe_mcs = dhe_mcs
+
+cap gen Ydses_c5 = ydses_c5 
+
+cap gen L_Ydses_c5 = l_ydses_c5
+
+cap gen L_Dhe = l_dhe
+cap gen L_Dhe_pcs = l_dhe_pcs
+cap gen L_Dhe_mcs = l_dhe_mcs
+
+cap gen Dlltsd = dlltsd
+cap gen Dlltsd01 = dlltsd01
+
+cap gen L_Dlltsd = l_dlltsd
+cap gen L_Dlltsd01 = l_dlltsd01
+
+
+
diff --git a/input/reg_RMSE.xlsx b/input/reg_RMSE.xlsx
index 8d9d984a4..db4b32b2c 100644
Binary files a/input/reg_RMSE.xlsx and b/input/reg_RMSE.xlsx differ
diff --git a/input/reg_education.xlsx b/input/reg_education.xlsx
index c4c96c2c4..6b9f12acd 100644
Binary files a/input/reg_education.xlsx and b/input/reg_education.xlsx differ
diff --git a/input/reg_employmentSelection.xlsx b/input/reg_employmentSelection.xlsx
index e3734b6ac..82e206187 100644
Binary files a/input/reg_employmentSelection.xlsx and b/input/reg_employmentSelection.xlsx differ
diff --git a/input/reg_fertility.xlsx b/input/reg_fertility.xlsx
index f9dac777c..4c9eb0fc6 100644
Binary files a/input/reg_fertility.xlsx and b/input/reg_fertility.xlsx differ
diff --git a/input/reg_health.xlsx b/input/reg_health.xlsx
index 74994f26e..a487187ce 100644
Binary files a/input/reg_health.xlsx and b/input/reg_health.xlsx differ
diff --git a/input/reg_home_ownership.xlsx b/input/reg_home_ownership.xlsx
index 4803ba8a7..f09869245 100644
Binary files a/input/reg_home_ownership.xlsx and b/input/reg_home_ownership.xlsx differ
diff --git a/input/reg_income.xlsx b/input/reg_income.xlsx
index f10d898aa..f0a7aac8a 100644
Binary files a/input/reg_income.xlsx and b/input/reg_income.xlsx differ
diff --git a/input/reg_leave_parental_home.xlsx b/input/reg_leave_parental_home.xlsx
new file mode 100644
index 000000000..289873ed9
Binary files /dev/null and b/input/reg_leave_parental_home.xlsx differ
diff --git a/input/reg_partnership.xlsx b/input/reg_partnership.xlsx
index 9c689b853..8eac01b71 100644
Binary files a/input/reg_partnership.xlsx and b/input/reg_partnership.xlsx differ
diff --git a/input/reg_retirement.xlsx b/input/reg_retirement.xlsx
index 08663ae2b..0298e09b7 100644
Binary files a/input/reg_retirement.xlsx and b/input/reg_retirement.xlsx differ
diff --git a/input/reg_wages.xlsx b/input/reg_wages.xlsx
index 0fdacc128..326c030a4 100644
Binary files a/input/reg_wages.xlsx and b/input/reg_wages.xlsx differ
diff --git a/input/scenario_parametricMatching.xlsx b/input/scenario_parametricMatching.xlsx
index 70899584f..ec873c5ea 100644
Binary files a/input/scenario_parametricMatching.xlsx and b/input/scenario_parametricMatching.xlsx differ