diff --git a/input/InitialPopulations/SimPaths - lag strucure - UK.xlsx b/input/InitialPopulations/SimPaths - lag strucure - UK.xlsx new file mode 100644 index 000000000..0402f725d Binary files /dev/null and b/input/InitialPopulations/SimPaths - lag strucure - UK.xlsx differ diff --git a/input/InitialPopulations/compile/00_master.do b/input/InitialPopulations/compile/00_master.do index 8717fdb0a..b86f0a953 100644 --- a/input/InitialPopulations/compile/00_master.do +++ b/input/InitialPopulations/compile/00_master.do @@ -8,7 +8,7 @@ * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * WAS EUL version - UKDA-7215-stata [to wave 7] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 30 Apr 2025 +* LAST UPDATE: 18 July 2025 DP *************************************************************************************** *************************************************************************************** @@ -37,11 +37,12 @@ set matsize 1000 * Working directory *global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations" -global dir_work "C:\Users\Patryk\Documents\SP_prep_pop" +*global dir_work "C:\Users\Patryk\Documents\SP_prep_pop" +global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations" * Directory which contains do files -*global dir_do "${dir_work}/do" -global dir_do "C:\Users\Patryk\git\SimPathsFork\input\InitialPopulations\compile" +global dir_do "${dir_work}/do" +*global dir_do "C:\Users\Patryk\git\SimPathsFork\input\InitialPopulations\compile" * Directory which contains data files global dir_data "${dir_work}/data" @@ -51,13 +52,13 @@ global dir_log "${dir_work}/log" * Directory which contains UKHLS data *global dir_ukhls_data "J:\01 DATA\UK\ukhls\wave14\stata\stata13_se\ukhls" -//global dir_ukhls_data "D:\Dasha\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\ukhls" -global dir_ukhls_data "C:\Users\Patryk\Documents\SP_prep_pop\ukhls\UKDA-6614-stata\stata\stata13_se\ukhls" +global dir_ukhls_data 
"D:\Dasha\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\ukhls" +*global dir_ukhls_data "C:\Users\Patryk\Documents\SP_prep_pop\ukhls\UKDA-6614-stata\stata\stata13_se\ukhls" * Directory which contains WAS data *global dir_was_data "J:\01 DATA\UK\was\wave7\stata\stata13_se" -//global dir_was_data "D:\Dasha\UK-original-data\WAS\UKDA-7215-stata\stata\stata13_se" -global dir_was_data "C:\Users\Patryk\Documents\WAS\UKDA-7215-stata\stata\stata13_se" +global dir_was_data "D:\Dasha\UK-original-data\WAS\UKDA-7215-stata\stata\stata13_se" +*global dir_was_data "C:\Users\Patryk\Documents\WAS\UKDA-7215-stata\stata\stata13_se" * Directory which contains original initial popultions global dir_ipop_orig "${dir_work}/original_initial_populations" @@ -112,12 +113,12 @@ do "${dir_do}/04_social_care_provided.do" do "${dir_do}/05_create_benefit_units.do" * reweight data and slice into yearly segments do "${dir_do}/06_reweight_and_slice.do" -/* impute wealth data for selected years +* impute wealth data for selected years do "${dir_do}/07_was_wealth_data.do" forvalues year = $wealthStartYear / $wealthEndYear { global yearWealth = `year' do "${dir_do}/08_wealth_to_ukhls.do" -}*/ +} do "${dir_do}/09_finalise_input_data.do" do "${dir_do}/10_check_yearly_data.do" diff --git a/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do b/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do index aa0f5efba..9a9162c1f 100644 --- a/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do +++ b/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 14 Jan 2025 DP +* LAST UPDATE: 18 July 2025 DP * NOTE: Called from 00_master.do - see master file for further details *************************************************************************************** @@ -167,9 +167,28 @@ gen inc_tu = frmnthimp_dv if ficode == 25 
//Trade Union / Friendly Society Payme gen inc_ma = frmnthimp_dv if ficode == 26 //Maintenance or Alimony gen inc_fm = frmnthimp_dv if ficode == 27 //payments from a family member not living here gen inc_oth = frmnthimp_dv if ficode == 38 //any other regular payment (not asked in Wave 1) -keep swv pidp hidp inc_pp inc_tu inc_ma inc_fm inc_oth -drop if missing(inc_pp) & missing(inc_tu) & missing(inc_ma) & missing(inc_fm) & missing(inc_oth) -collapse (sum) inc_pp inc_tu inc_ma inc_fm inc_oth, by(swv pidp hidp) +/* +8 Severe Disablement Allowance +9 Industrial Injury Disablement Allowance +10 Disability Living Allowance +11 Attendance Allowance +12 Carer's Allowance (formerly Invalid Care Allowance) +13 War Disablement Pension +14 Incapacity Benefit +33 Employment and Support Allowance +34 Return to Work Credit +35 Sickness and Accident Insurance +37 Other Disability Related Benefit or Payment +41 Personal Independence Payments +43 Child Disability Payment +44 Adult Disability Payment +45 Pension Age Disability Payment +*/ +gen inc_disab = frmnthimp_dv if (ficode>=8 & ficode<=14) | ficode==33 | ficode==34 | ficode==35 | ficode==37 | ficode==41 | ficode==43 | ficode==44 | ficode==45 + +keep swv pidp hidp inc_pp inc_tu inc_ma inc_fm inc_oth inc_disab +drop if missing(inc_pp) & missing(inc_tu) & missing(inc_ma) & missing(inc_fm) & missing(inc_oth) & missing(inc_disab) +collapse (sum) inc_pp inc_tu inc_ma inc_fm inc_oth inc_disab, by(swv pidp hidp) save "$dir_data\tmp_income", replace restore diff --git a/input/InitialPopulations/compile/02_create_UKHLS_variables.do b/input/InitialPopulations/compile/02_create_UKHLS_variables.do index b066757ce..64589f2f2 100644 --- a/input/InitialPopulations/compile/02_create_UKHLS_variables.do +++ b/input/InitialPopulations/compile/02_create_UKHLS_variables.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 14 Jan 2025 DP +* LAST UPDATE: 18 
July 2025 DP * NOTE: Called from 00_master.do - see master file for further details * Use -9 for missing values *************************************************************************************** @@ -430,6 +430,27 @@ gen dhe_pcs_flag = missing(dhe_pcs) replace dhe_pcs = round(dhe_pcs_prediction) if missing(dhe_pcs) bys dhe_pcs_flag : sum dhe_pcs + +/************Partner's self-rated health - mental (dhe_mcssp) and physical (dhe_pcssp) component, merged on swv idpartner***************/ +preserve +keep swv idperson dhe_mcs dhe_pcs +rename idperson idpartner +rename dhe_mcs dhe_mcssp +rename dhe_pcs dhe_pcssp + +save "$dir_data/temp_dhe", replace +restore + +merge m:1 swv idpartner using "$dir_data/temp_dhe" +la var dhe_mcssp "Partner's Self-rated health health - mental component" +la var dhe_pcssp "Partner's Self-rated health health - physical component" +keep if _merge == 1 | _merge == 3 +drop _merge +replace dhe_mcssp=-9 if missing(dhe_mcssp) & idpartner>0 +replace dhe_pcssp=-9 if missing(dhe_pcssp) & idpartner>0 +//fre dhe_mcssp dhe_pcssp if idpartner>0 + + /***************************** Life Satisfaction ***************************************************************************/ /* Life satisfaction, self report. Continuous scale 0 to 7. */ @@ -475,20 +496,53 @@ ethn_dv -- Ethnic group (derived from multiple sources) 17 arab 97 any other ethnic group */ -*Note: Missing ethnic group is combined with "Other" +*ONS style definition (but missing is kept as a separate category) +cap gen dot01 = . 
+replace dot01 = 1 if ethn_dv>=1 & ethn_dv <=4 //white// +replace dot01 = 2 if ethn_dv>=5 & ethn_dv<=8 //mixed // +replace dot01 = 3 if ethn_dv>=9 & ethn_dv<=13 //asian// +replace dot01 = 4 if ethn_dv>=14 & ethn_dv<=16 //black// +replace dot01 = 5 if ethn_dv==17 | ethn_dv==97 //other, arab// +replace dot01 = 6 if ethn_dv==-9 //missing// +lab var dot01 "Ethnicity" +cap label define dot01 1 "White" 2 "Mixed or Multiple ethnic groups" 3 "Asian or Asian British" 4 "Black, Black British, Caribbean, or African" 5 "Other ethnic group" 6 "Missing" +label values dot01 dot01 +//fre dot01 + +/************Partner's ethnicity***************/ +preserve +keep swv idperson dot01 +rename idperson idpartner +rename dot01 dot01_sp +save "$dir_data/temp_dot01", replace +restore + +merge m:1 swv idpartner using "$dir_data/temp_dot01" +la var dot01_sp "Partner's Ethnicity (6 cat)" +keep if _merge == 1 | _merge == 3 +drop _merge +replace dot01_sp=6 if missing(dot01_sp) & idpartner>0 //partner id present but no ethnicity matched: use the "Missing" category +replace dot01_sp=-9 if missing(dot01_sp) //remaining cases (idpartner not >0): file-wide -9 missing code (was -69, inconsistent with "Use -9 for missing values") +fre dot01_sp + +//impute missing status of a respondent by spouses status if not missing +fre dot01_sp if dot01==6 +replace dot01=dot01_sp if dot01==6 & (dot01_sp>=1 & dot01_sp<=5) //(9,499 real changes made) out of 21914 = 43% of missing is imputed by partner's ethnicity + + +* Ethnicity definition used in regression estimates cap gen dot = . 
-replace dot = 1 if ethn_dv>=1 & ethn_dv <=4 //white// -replace dot = 2 if ethn_dv>=5 & ethn_dv<=8 //mixed // -replace dot = 3 if ethn_dv>=9 & ethn_dv<=13 //asian// -replace dot = 4 if ethn_dv>=14 & ethn_dv<=16 //black// -replace dot = 5 if ethn_dv==17 | ethn_dv==97 //other, arab// -replace dot = 5 if ethn_dv==-9 //missing// -lab var dot "DEMOGRAPHIC: Ethnicity" -cap label define dot -9 "missing" 1 "White" 2 "Mixed or Multiple ethnic groups" 3 "Asian or Asian British" 4 "Black, Black British, Caribbean, or African" 5 "Other or missing ethnic group" +replace dot = 1 if ethn_dv>=1 & ethn_dv <=4 //white// +replace dot = 2 if ethn_dv>=9 & ethn_dv<=13 //asian// +replace dot = 3 if ethn_dv>=14 & ethn_dv<=16 //black// +replace dot = 4 if ethn_dv==17 | ethn_dv==97 | ethn_dv==-9 | (ethn_dv>=5 & ethn_dv<=8) //arab, mixed, other and missing +lab var dot "Ethnicity" +cap label define dot 1 "White" 2 "Asian or Asian British" 3 "Black, Black British, Caribbean, or African" 4 "Other or missing ethnic group" label values dot dot //fre dot + /******************************Education status*******************************/ *Use hiqual variable, code negative values to missing *Low education: Other qualification, no qualification @@ -785,7 +839,9 @@ recode jshrs (-9/-1 . = .) //lhw is the sum of the above, but don't want to take -9 into account. Recode into missing value. egen lhw=rowtotal(jbhrs jbot jshrs) replace lhw = ceil(lhw) -la var lhw "Hours worked per week" +la var lhw "Hours worked per week (capped at 126)" +replace lhw = 126 if lhw > 126 //ensure lhw doesn't go above weekly max 168 minus 6*7 hours of sleep. 
+//(37 real changes made) //fre lhw // Lag(1) of hours of work @@ -867,16 +923,55 @@ replace dlltsd = 1 if missing(jbstat) & l.jbstat == 8 la var dlltsd "DEMOGRAPHIC: LT sick or disabled" +//check if in receipt of disability benefits +/* +fre bendis1 //Income: Disability benefits: Incapacity Benefit +fre bendis2 //Income: Disability benefits: Employment and Support Allowance +fre bendis3 //Income: Disability benefits: Severe Disablement Allowance +fre bendis4 //Income: Disability benefits: Carer's Allowance +fre bendis5 //Income: Disability benefits: Disability Living Allowance +fre bendis6 //Income: Disability benefits: Return to work credit +fre bendis7 //Income: Disability benefits: Attendance Allowance +fre bendis8 //Income: Disability benefits: Industrial Injury Disablement Benefit +fre bendis9 //Income: Disability benefits: War disablement pension +fre bendis10 //Income: Disability benefits: Sickness and Accident Insurance +fre bendis11 //Income: Disability benefits: Universal Credit +fre bendis12 //Income: Disability benefits: Personal Independence Payments +fre bendis13 //Income: Disability benefits: Child Disability Payment +fre bendis14 //Income: Disability benefits: Adult Disability Payment +fre bendis15 //Income: Disability benefits: Pension Age Disability Payment +fre bendis97 //Income: Disability benefits: Any other disability related benefit or payment +*/ +gen disben = 0 +replace disben = 1 if inlist(1, bendis1, bendis2, bendis3, bendis4, bendis5, bendis6, bendis7, bendis8, bendis9, /// + bendis10, bendis12, bendis13, bendis14, bendis15) +/*Note: exclude bendis11 (Universal credit) as it can be jointly received and bendis97 (any other) +bysort swv idhh (idhh): gen hhsize = _N +tab2 hhsize disben +tab2 dlltsd disben */ + +//second check: disability income based on ficode (disability income is computed in 01_prepare_ukhls_pooled_data) +gen disben2 = (inc_disab>0 & inc_disab<.) 
+ +//select those who report being disabled & in receipt of disability benefits according to both checks +gen dlltsd01 = (dlltsd==1 | (disben==1 & disben2==1)) +la var dlltsd01 "DEMOGRAPHIC: LT sick/disabled or receives disability benefits" +//fre dlltsd01 +//tab2 dlltsd01 dlltsd + + /*******************Long-term sick or disabled - spouse ***********************/ preserve -keep swv idperson dlltsd +keep swv idperson dlltsd dlltsd01 rename idperson idpartner rename dlltsd dlltsd_sp +rename dlltsd01 dlltsd01_sp save "$dir_data/temp_dlltsd", replace restore merge m:1 swv idpartner using "$dir_data/temp_dlltsd" -la var dlltsd_sp "Partner's long-term sick" +la var dlltsd_sp "Partner's long-term sick/disabled" +la var dlltsd01_sp "Partner's long-term sick/disabled or receives disability benefits" keep if _merge == 1 | _merge == 3 drop _merge //fre dlltsd_sp @@ -1505,25 +1600,28 @@ replace dwt = 0 if missing(dwt) /***************************Keep required variables***************************/ keep ivfio idhh idperson idpartner idfather idmother dct drgn1 dwt dnc02 dnc dgn dgnsp dag dagsq dhe dhesp dcpst /// - ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dcpagdf dlltsd dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq dimlwt disclwt /// + ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dcpagdf dlltsd dlltsd01 dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq dimlwt disclwt /// dimxwt dhhwt jbhrs jshrs j2hrs jbstat les_c3 les_c4 lessp_c3 lessp_c4 lesdf_c4 ydses_c5 month scghq2_dv /// ypnbihs_dv yptciihs_dv yplgrs_dv ynbcpdf_dv ypncp ypnoab swv sedex ssscp sprfm sedag stm dagsp lhw l1_lhw pno ppno hgbioad1 hgbioad2 der adultchildflag /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - sedcsmpl sedrsmpl scedsmpl dhh_owned dukfr dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp ypnoab_lvl *_flag Int_Date dhe_mcs dhe_pcs dls dot unemp financial_distress + sedcsmpl sedrsmpl scedsmpl dhh_owned dukfr dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp 
dlltsd01_sp ypnoab_lvl *_flag Int_Date dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress sort swv idhh idperson /**************************Recode missing values*******************************/ foreach var in idhh idperson idpartner idfather idmother dct drgn1 dwt dnc02 dnc dgn dgnsp dag dagsq dhe dhesp dcpst /// - ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dlltsd dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq /// + ded deh_c3 der dehsp_c3 dehm_c3 dehf_c3 dehmf_c3 dcpen dcpyy dcpex dlltsd dlltsd01 dlrtrd drtren dlftphm dhhtp_c4 dhm dhm_ghq /// jbhrs jshrs j2hrs jbstat les_c3 les_c4 lessp_c3 lessp_c4 lesdf_c4 ydses_c5 scghq2_dv /// ypnbihs_dv yptciihs_dv yplgrs_dv swv sedex ssscp sprfm sedag stm dagsp lhw l1_lhw pno ppno hgbioad1 hgbioad2 der dhh_owned /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - scghq2_dv_miss_flag dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp ypnoab_lvl *_flag dhe_mcs dhe_pcs dls dot unemp { + scghq2_dv_miss_flag dchpd dagpns dagpns_sp CPI lesnr_c2 dlltsd_sp dlltsd01_sp ypnoab_lvl *_flag dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp { qui recode `var' (-9/-1=-9) (.=-9) } + + + *recode missings in weights to zero. 
foreach var in dimlwt disclwt dimxwt dhhwt { qui recode `var' (.=0) (-9/-1=0) @@ -1583,6 +1681,8 @@ local files_to_drop temp_mother_dag.dta temp_ypnb.dta tmp_partnershipDuration.dta + temp_dot01.dta + ; #delimit cr // cr stands for carriage return diff --git a/input/InitialPopulations/compile/03_social_care_received.do b/input/InitialPopulations/compile/03_social_care_received.do index 101f3a31e..a6df77c2a 100644 --- a/input/InitialPopulations/compile/03_social_care_received.do +++ b/input/InitialPopulations/compile/03_social_care_received.do @@ -3,7 +3,7 @@ * FILE TO EXTRACT UKHLS DATA FOR SOCIAL CARE RECEIPT TO INCLUDE IN INITIAL POPULATION * * AUTH: Justin van de Ven (JV) -* LAST EDIT: Daria Popova +* LAST EDIT: 18 July 2025 DP * *******************************************************************************/ diff --git a/input/InitialPopulations/compile/04_social_care_provided.do b/input/InitialPopulations/compile/04_social_care_provided.do index 152984d16..a3997f66d 100644 --- a/input/InitialPopulations/compile/04_social_care_provided.do +++ b/input/InitialPopulations/compile/04_social_care_provided.do @@ -3,9 +3,9 @@ * FILE TO EXTRACT UKHLS DATA FOR SOCIAL CARE PROVISION TO INCLUDE IN INITIAL POPULATION * * AUTH: Justin van de Ven (JV) -* LAST EDIT: Daria Popova +* LAST EDIT: 18 July 2025 DP * -*******************************************************************************/ +********************************************************************************/ *************************************************************************************** cap log close @@ -13,12 +13,12 @@ log using "${dir_log}/04_social_care_provided.log", replace *************************************************************************************** /******************************************************************************** local data directories - commented out when using master program -*******************************************************************************/ 
+********************************************************************************/ /********************************************************************** * start analysis -*********************************************************************/ +**********************************************************************/ cd "${dir_data}" disp "identifying social care provision" @@ -66,7 +66,7 @@ save "${dir_data}/ukhls_scprov_pooled0.dta", replace /************************************************************************************** * process variables -*************************************************************************************/ +**************************************************************************************/ use "ukhls_scprov_pooled0.dta", clear // provision of care @@ -123,22 +123,39 @@ label define careWho 1 "partner only" 2 "partner and non-partner" 3 "non-partner keep pidp swv careWho aidhrs_adj rename aidhrs_adj aidhrs + + + rename pidp idperson save "ukhls_scprov_pooled1.dta", replace /************************************************************************************** * merge with main data set -*************************************************************************************/ +**************************************************************************************/ disp "merge results with existing data" use "UKHLS_pooled_all_obs_03.dta", clear merge 1:1 idperson swv using ukhls_scprov_pooled1, keep(1 3) nogen -foreach var of varlist careWho aidhrs { +foreach var of varlist careWho /*aidhrs*/ { replace `var' = -9 if (missing(`var')) } +recode aidhrs (.=0) //missing care hours are set to 0 here (no longer -9) so the cap arithmetic below works +//Add variable for capped care hours provided (as used in new labour supply estimates) +cap gen max_possible_aidhrs = 168 - lhw - 42 //subtract hours worked (lhw) and 42 hours of sleep (6 per night) from the 168-hour week +fre max_possible_aidhrs +gen aidhrs_excess = (aidhrs - max_possible_aidhrs) if aidhrs > max_possible_aidhrs +list aidhrs lhw max_possible_aidhrs aidhrs_excess if aidhrs_excess <. +count if aidhrs_excess <. 
+gen careHoursProvidedWeekly = aidhrs +replace careHoursProvidedWeekly = max_possible_aidhrs if aidhrs > max_possible_aidhrs & aidhrs <. +assert careHoursProvidedWeekly <= max_possible_aidhrs +assert lhw+careHoursProvidedWeekly+42 <=168 +lab var careHoursProvidedWeekly "Weekly hours of care provided (capped)" +fre careHoursProvidedWeekly sort idperson swv save "ukhls_pooled_all_obs_04.dta", replace @@ -146,7 +163,7 @@ save "ukhls_pooled_all_obs_04.dta", replace cap log close /************************************************************************************** * clean-up and exit -*************************************************************************************/ +**************************************************************************************/ #delimit ; local files_to_drop int_temp.dta @@ -167,5 +184,3 @@ foreach file of local files_to_drop { erase "$dir_data/`file'" } - - diff --git a/input/InitialPopulations/compile/05_create_benefit_units.do b/input/InitialPopulations/compile/05_create_benefit_units.do index 7e4eac830..932e33aef 100644 --- a/input/InitialPopulations/compile/05_create_benefit_units.do +++ b/input/InitialPopulations/compile/05_create_benefit_units.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 15 Jan 2025 DP +* LAST UPDATE: 18 July 2025 DP * NOTE: Called from 00_master.do - see master file for further details *************************************************************************************** @@ -17,12 +17,12 @@ log using "${dir_log}/05_drop_hholds_create_benefit_units.log", replace ******************************************************************************** use "$dir_data\UKHLS_pooled_all_obs_04.dta", clear -/******************************************************************************/ +/*******************************************************************************/ fre ivfio keep if ivfio == 1 | ivfio == 2 | ivfio == 21 | 
ivfio == 24 fre ivfio -/******************************Split households*******************************/ +/******************************Split households********************************/ *DP: This procedure is revised following the approach taken for the EU-SILC based models /**********************Rules and assumptions*********************************** 1. Each HH can contain: Responsible Male, and/or Responsible Female, Children, Other members. @@ -501,3 +501,17 @@ drop if stm<0 save "$dir_data\ukhls_pooled_all_obs_05.dta", replace cap log close +/************************************************************************************** +* clean-up and exit +**************************************************************************************/ +#delimit ; +local files_to_drop + fatherinfo.dta + motherinfo.dta + orphans.dta + ; +#delimit cr // cr stands for carriage return + +foreach file of local files_to_drop { + erase "$dir_data/`file'" +} diff --git a/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do b/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do deleted file mode 100644 index e901b07ee..000000000 --- a/input/InitialPopulations/compile/05_drop_hholds_create_benefit_units.do +++ /dev/null @@ -1,433 +0,0 @@ -*************************************************************************************** -* PROJECT: ESPON: construct initial populations for SimPaths using UKHLS data -* DO-FILE NAME: 05_drop_hholds_create_ukhls_yearly_data.do -* DESCRIPTION: Screens data and identifies benefit units -*************************************************************************************** -* COUNTRY: UK -* DATA: UKHLS EUL version - UKDA-6614-stata [to wave m] -* AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 10 Apr 2024 (JV) -* NOTE: Called from 00_master.do - see master file for further details -*************************************************************************************** - - 
-******************************************************************************** -cap log close -log using "${dir_log}/05_drop_hholds.log", replace -******************************************************************************** - -use "$dir_data\UKHLS_pooled_all_obs_04.dta", clear -/******************************************************************************/ -fre ivfio -keep if ivfio == 1 | ivfio == 2 | ivfio == 21 | ivfio == 24 -fre ivfio -//(88,338 observations deleted) -/******************************Split households*******************************/ - -*DP: script from "Data management replication file" -/**********************Rules and assumptions*********************************** -1. Each HH can contain: Responsible Male, and/or Responsible Female, Children, Other members. -In the simulation everyone starts as "Other member" and is assigned one of the roles in the HH. - - 1.1. Responsible male and female create a partnership couple leading the HH. Any additional couple - creates new HH. A couple with / composed of people under the age to leave home (18) - will still leave together and set up a new HH. - - 1.1.1. Children should follow the mother if she's moving to a new HH. - - 1.2. After the above there should be only singles left in addition to the leading couple. - If they are above 18, they will leave and set up their own HH. - 1.3. After the above there should only be children left in addition to the original HH. - Children will live with mother if defined in the data, otherwise with father. If neither - exists, they will be considered as orphans. - 1.4. Orphans are assigned a woman or a man from the household in which they live as a parent. 
-*/ - - - -*Create unique partnership identifier within each household -/*Cond(x,a,b) -Description: a if x is true and nonmissing, b if x is false; a if c is not specified and x evaluates to missing -pno -- person number -ppno -- partner's person number: PNO -*/ -gen apartnum = cond(pno0 - - -*by idhh, new file with mother id, father id, and apartnum. Then assign that apartnum to child. -preserve -keep swv idhh idperson apartnum -rename idperson idmother -rename apartnum apartnumm -gen idhhmother = idhh -save "$dir_data/temp_mother", replace -rename idmother idfather -rename apartnum apartnumf -gen idhhfather = idhh -save "$dir_data/temp_father", replace -restore - - -merge m:1 swv idhh idmother using "$dir_data/temp_mother" -keep if _merge == 1 | _merge == 3 -drop _merge -merge m:1 swv idhh idfather using "$dir_data/temp_father" -keep if _merge == 1 | _merge == 3 -drop _merge - - -*Keep children under age to become responsible with parents unless their partner lives in the hh: -*(children above age to become responsible will create independent households) -replace apartnum = apartnumm if missing(apartnum) & dag < $age_become_responsible & ppno == 0 //ppno == 0 ensures there is no partner living with them -replace apartnum = apartnumf if missing(apartnum) & dag < $age_become_responsible & ppno == 0 -drop apartnumm apartnumf - - -*Assign new HH numbers where there is more than 1 couple in the HH: -egen newid = group(swv idhh apartnum) -tostring(newid), replace -replace newid = "999999"+newid if newid != "." -destring(newid), replace -replace idhh = newid if apartnum > 1 & !missing(apartnum) - - -*If a single has a child ==> it should go with them. -*If aged above the age to become responsible, and pno > 1 (so more than 2 person in the HH) & ppno == 0 (partner not in the HH) should move out -cap drop newid -egen newid = group(swv idhh pno) if dag >= $age_become_responsible & pno > 1 & ppno == 0 -tostring(newid), replace -replace newid = "888888"+newid if newid != "." 
-destring(newid), replace -replace idhh = newid if !missing(newid) -drop newid - - -*Still some households with 3 adults? -bys swv idhh: egen adult_count = count(idperson) if dag > $age_become_responsible -fre adult_count -/* -egen newid = group(idhh pno) if adult_count > 2 & ppno == 0 & dag > $age_become_responsible -tostring(newid), replace -replace newid = "777777"+newid if newid != "." -destring(newid), replace -replace idhh = newid if !missing(newid) -drop newid adult_count -*/ - - -*Check for orphans: -gen orphan_dummy = 1 if dag < $age_become_responsible & idmother <0 & idfather <0 -bys swv idhh: egen orphan_hh = max(orphan_dummy) -tab orphan_dummy - -*Try to assign adult female id as mother, if not available adult male: -bys swv idhh: gen long idmother2 = idperson if dgn == 0 & dag > 18 //Keep at 18 and not age to become responsible as minimum age to give birth is 18? -gsort +swv +idhh -dag -by swv idhh: carryforward idmother2, replace -replace idmother = idmother2 if dag < $age_become_responsible & idmother<0 & idfather<0 & !missing(idmother2) - -bys swv idhh: gen long idfather2 = idperson if dgn == 1 & dag > 18 -gsort +swv +idhh -dag -by swv idhh: carryforward idfather2, replace -replace idfather = idfather2 if dag < $age_become_responsible & idmother<0 & idfather<0 & !missing(idfather2) - -/**************************Drop remaining orphans *********************************************/ -count if dag < $age_become_responsible & idmother<0 & idfather<0 -/*143 cases in total*/ -bys swv: count if dag < $age_become_responsible & idmother<0 & idfather<0 -drop if dag < $age_become_responsible & idmother<0 & idfather<0 -/**********************************************************************************************/ - - -*Check for same-sex couples -preserve -keep idhh ppno dgn -rename dgn dgn_partner -rename ppno pno -drop if pno == 0 -save "$dir_data/temp_sex", replace -restore - -merge 1:1 idhh pno using "$dir_data/temp_sex" -keep if _merge == 1 | _merge == 3 
-drop _merge -gen same_sex_couple = 1 if dgn == dgn_partner & !missing(dgn) & !missing(dgn_partner) - - -*Check same-sex couples for children: father/mother should stay with children -*Double-check as might not work properly -bys swv idhh: egen samesex_hh = max(same_sex_couple) -tab samesex_hh if dag < $age_become_responsible //HH where same-sex couple is with someone < 18, N=19 - - -gen long parent_id_temp = idmother if samesex_hh == 1 & dag < 18 -replace parent_id_temp = idfather if parent_id_temp == -9 & samesex_hh == 1 -by swv idhh: egen long max_parent_id = max(parent_id_temp) -replace parent_id_temp = idperson if missing(parent_id_temp) & samesex_hh == 1 & !missing(max_parent_id) -drop max_parent_id -egen ss_parent = group(idhh parent_id_temp) - - -replace idmother = . if idmother < 0 -replace idfather = . if idfather < 0 - -*Break-up same-sex couples into separate households: -gen long idmother3 = idmother if dag < $age_become_responsible -replace idmother3 = idperson if missing(idmother3) & dgn == 0 -gen long idfather3 = idfather if dag < $age_become_responsible -replace idfather3 = idperson if missing(idfather3) & dgn == 1 - -sort swv idhh pno -bys swv idhh: gen dgn_hh = dgn if pno == 1 & samesex_hh == 1 -by swv idhh: carryforward dgn_hh, replace - - -//bys idhh same_sex_couple: replace same_sex_couple = . if _n != 1 //Assign new ID to one of the couple -egen newid2 = group(swv idhh idmother3) if samesex_hh == 1 & dgn_hh == 0 -egen newid3 = group(swv idhh idfather3) if samesex_hh == 1 & dgn_hh == 1 -replace newid3 = newid3 + 5000 -replace newid2 = newid3 if missing(newid2) -egen newid = group(swv idhh newid2) if samesex_hh == 1 - -replace idpartner = 0 if !missing(newid) == 1 //We don't allow same-sex partnerships in the simulation -tostring(newid), replace -replace newid = "666666"+newid if newid != "." 
-destring(newid), replace -replace idhh = newid if !missing(newid) -drop newid same_sex_couple dgn_partner - - -/************Drop same sex households (still do the split above in case we wanted to revert)*****************/ -count if samesex_hh==1 -bys swv: fre samesex_hh -/* 2,855 hhds in total, aprox 230 -250 in each wave */ -drop if samesex_hh == 1 -/************************************************************************************************************/ - - -* Clean up -*Set idpartner = 0 if single HH: -bys swv idhh: egen count = count(idperson) if dag >= $age_become_responsible | (dag < $age_become_responsible & ppno != 0) -fre count -replace idpartner = 0 if count == 1 - - -*"Home" variable -/*We decided to distinguish between a household (fiscal unit) and a "home". -For example children living with their parents will share the same "home". -Create a new variable that contains the original household id for adult children -1-For those who are not adult children, home id should be the same as idhh -2-For those who are adult children, home id should be the idhh before the split -(But: in the data there can be multigenerational families etc. that should still be split (?) 
-So the home variable can only be defined after household splitting -home id == idhh for everyone (the one we modified), but for households with adult children home id == hidp -*/ -gen double idhome = idhh -format idhome %15.0g -replace idhome = idhhmother if adultChildFlag == 1 & !missing(idhhmother) -replace idhome = idhhfather if adultChildFlag == 1 & missing(idhhmother) & !missing(idhhfather) -*/ - -/**************************************************************************************************************************/ -*DP: script from "UK Compile do-file" - a more recent version of a split -/**************************************************************************************************************************/ -* recode same sex couples as singles -replace idpartner = -9 if (ssscp==1) -replace dcpst = 2 if (ssscp==1) -foreach vv in dgnsp dagsp dehsp_c3 dhesp lessp_c3 lessp_c4 { - replace `vv' = -9 if (ssscp==1) -} - -* adult defined as 18 or over, or if married -cap gen adult = (dag>=$age_become_responsible) -replace adult = 1 if (adult==0 & dcpst==1) -cap gen child = 1 - adult - -* define benefit units -cap gen long idbenefitunit = . -cap gen long idbupartner = . - -order swv idhh idbenefitunit idbupartner idperson idpartner idmother idfather dag adult child -gsort swv idhh -dag -bys swv idhh: replace idbenefitunit = idperson[1] -bys swv idhh: replace idbupartner = idpartner[1] - -replace idbupartner = . if (adult==1 & idperson!=idbenefitunit & idpartner!=idbenefitunit) -replace idbenefitunit = . if (adult==1 & idperson!=idbenefitunit & idpartner!=idbenefitunit) -replace idbupartner = . if (child==1 & idfather!=idbenefitunit & idmother!=idbenefitunit & idfather!=idbupartner & idmother!=idbupartner) -replace idbenefitunit = . 
if (child==1 & idfather!=idbenefitunit & idmother!=idbenefitunit & idfather!=idbupartner & idmother!=idbupartner) -replace idbenefitunit = idperson if (missing(idbenefitunit) & adult==1 & (missing(idpartner) | idpartner<0)) -bys swv idhh: replace idbenefitunit = idperson if (missing(idbenefitunit) & adult==1 & !missing(idbenefitunit[_n-1]) & idpartner!=idbenefitunit[_n-1]) -replace idbupartner = idpartner if (missing(idbupartner) & idbenefitunit==idperson & !missing(idpartner) & idpartner>0) -bys swv idhh: replace idbenefitunit = idpartner if (missing(idbenefitunit) & adult==1 & !missing(idbenefitunit[_n-1]) & idpartner==idbenefitunit[_n-1]) -replace idbupartner = idperson if (missing(idbupartner) & idbenefitunit==idpartner) -replace idbenefitunit = idmother if (missing(idbenefitunit) & child==1 & idmother==idbenefitunit[_n-1]) -replace idbenefitunit = idfather if (missing(idbenefitunit) & child==1 & idfather==idbenefitunit[_n-1]) - -drop if idbenefitunit == . // 4403 observations deleted -drop if idbenefitunit<0 // 0 observations deleted -drop idbupartner - -// screen out benefit units with multiple adults of same sex -gen adultMan = adult * (dgn==1) -gen adultWoman = adult * (dgn==0) -gsort swv idbenefitunit -bys swv idbenefitunit: egen sumMen = sum(adultMan) -bys swv idbenefitunit: egen sumWomen = sum(adultWoman) -tab swv sumMen -tab swv sumWomen -drop if (sumWomen>1) // 1638 obserations -drop if (sumMen>1) // 14 observations - -// adjust bu identifiers to allow for possiblity that units are sampled in same calendar year -order idbenefitunit stm swv -gsort idbenefitunit stm swv -gen idtemp = 1 -gen switch = 0 -replace switch = 1 if (idbenefitunit!=idbenefitunit[_n-1] | swv!=swv[_n-1]) -order idbenefitunit stm swv idtemp switch -replace idtemp = idtemp[_n-1] + switch if (_n>1) - -drop idbenefitunit switch -rename idtemp idbenefitunit - -sum idbenefitunit, d - -gsort idbenefitunit -dag -by idbenefitunit: replace idhh = idhh[1] if (idhh != idhh[1]) - -order 
idbenefitunit, after(idhh) -sort idhh idbenefitunit - - -*************************************************** -*check for duplicates in terms of stm amd idperson* -*************************************************** -duplicates report swv idperson //no such cases// - -duplicates report stm idperson //16420 cases have duplicates// -duplicates report stm idhh idperson //16420 cases have duplicates, they also have the same hh ids // - -cap drop duplicate -duplicates tag stm idperson , generate(duplicate) -fre duplicate - -order stm idperson idhh idbenefitunit Int_Date, last /*duplicates appear due to the same persons interviewed twice -during the same calendar year, typically in Jan-Feb and then in Nov-Dec, so assume that the first interview is a catching up interview from previous wave -and keep only the second one . the alternative is to change the stm for the first interview to stm-1*/ - -sort duplicate stm idperson Int_Date -cap drop keep -by duplicate stm idperson: gen todrop = (_n>1) //16,423 obs will be dropped - -*drop duplicate observations -drop if todrop==1 - -duplicates report stm idperson -duplicates report stm idhh idperson - - -/********************************Drop households with missing values***********/ -cap gen dropObs = . //Generate variable indicating whether household should be dropped - -count if idbenefitunit == . -*bys swv: count if idbenefitunit == . -* 1,964 observations missing idbenefitunit identifier - mostly children living with grandparents -* 100-150 per wave -* drop these from sample, as beyond the modelling scope -replace dropObs = 1 if (idbenefitunit == .) 
- - -*4,250 obs (approx 300-400 by wave) have zero adults - these are children who were interviewed in a year other than that of their parents -cap drop adult_count -bys idhh stm: egen adult_count = sum(adult) -count if adult_count==0 -*bys swv: count if adult_count==0 -replace dropObs = 1 if (adult_count==0) - - -*Remove household if missing values present: (not using previous wave's values as migration possible) (353 obs) -count if drgn1 == -9 -replace dropObs = 1 if drgn1 == -9 - - -*Missing age (140 obs): -count if dag == -9 -replace dropObs = 1 if dag == -9 - -*Missing age of partner (but has a partner, 46 cases): -count if dagsp == -9 & idpartner != -9 -replace dropObs = 1 if dagsp == -9 & idpartner != -9 - - -*Health status - remove household if missing for adults - 0 cases due to imputation -count if (dhe == -9 ) & dag > $age_become_responsible -count if (dhe == -9 ) & dag>0 & dag<= $age_become_responsible -/*no missing cases due to imputations */ -replace dropObs = 1 if (dhe == -9) & dag > $age_become_responsible - -*Mental health status (1 obs): -count if dhm == -9 & dag > $age_become_responsible -count if dhm_ghq == -9 & dag > $age_become_responsible -/*no missing cases due to imputations */ -replace dropObs = 1 if dhm == -9 & dag > $age_become_responsible -replace dropObs = 1 if dhm_ghq == -9 & dag > $age_become_responsible - -*Health status of spouse - remove household if missing but individual has a spouse (46 obs) -count if dhesp == -9 & idpartner != -9 -/*no missing cases due to imputations */ -replace dropObs = 1 if (dhesp == -9) & idpartner != -9 - -*Education - remove household if missing education level for adults who are not in education (1,918 cases): -count if deh_c3 == -9 & dag >= $age_become_responsible & ded == 0 -replace dropObs = 1 if deh_c3 == -9 & dag >= $age_become_responsible & ded == 0 - -*Education of spouse - remove household if missing but individual has a spouse (14,720 obs) -count if dehsp_c3 == -9 & idpartner != -9 -replace 
dropObs = 1 if dehsp_c3 == -9 & idpartner != -9 - -*Parental education - 0 obs removed due to imputation -count if dehmf_c3 == -9 -replace dropObs = 1 if dehmf_c3 == -9 - -*Partnership status (808 obs): -count if dcpst == -9 -replace dropObs = 1 if dcpst == -9 - -*Activity status (392 cases): -count if les_c3 == -9 & dag >= $age_become_responsible -replace dropObs = 1 if les_c3 == -9 & dag >= $age_become_responsible - -*Activity status with retirement as a separate category (392 cases) -count if les_c4 == -9 & dag >= $age_become_responsible -replace dropObs = 1 if les_c4 == -9 & dag >= $age_become_responsible - -*Partner's activity status (30,481 cases) -count if lessp_c3 == -9 & idpartner != -9 -replace dropObs = 1 if lessp_c3 == -9 & idpartner != -9 - -*Own and spousal activity status (30,614) -count if lesdf_c4 == -9 & idpartner != -9 -replace dropObs = 1 if lesdf_c4 == -9 & idpartner != -9 - -*Household composition (808 cases): -count if dhhtp_c4 == -9 -replace dropObs = 1 if dhhtp_c4 == -9 - -*Income (14 cases): -count if ypnbihs_dv == -9 & dag >= $age_become_responsible //530 obs -count if yplgrs_dv == -9 & dag >= $age_become_responsible //704 obs -count if ydses_c5 == -9 //286 obs -count if ypncp == -9 & dag >= $age_become_responsible //0 obs - -replace dropObs = 1 if ypnbihs_dv == -9 & dag >= $age_become_responsible -replace dropObs = 1 if yplgrs_dv == -9 & dag >= $age_become_responsible -replace dropObs = 1 if ydses_c5 == -9 -replace dropObs = 1 if ypncp == -9 & dag >= $age_become_responsible - -*Indicator for households with missing values -cap drop dropHH -bys stm idhh: egen dropHH = max(dropObs) -bys stm: tab dropHH, mis -drop if stm<0 -save "$dir_data\ukhls_pooled_all_obs_05.dta", replace diff --git a/input/InitialPopulations/compile/06_reweight_and_slice.do b/input/InitialPopulations/compile/06_reweight_and_slice.do index 5e860572c..9e3142ba3 100644 --- a/input/InitialPopulations/compile/06_reweight_and_slice.do +++ 
b/input/InitialPopulations/compile/06_reweight_and_slice.do @@ -3,7 +3,7 @@ * WEIGHT ADJUSTMENT TO ACCOUNT FOR USING HOUSEHOLDS WITHOUT MISSING VALUES * * AUTH: Patryk Bronka, Daria Popova, Justin van de Ven -* LAST EDIT: 15 Dec 2025 DP +* LAST EDIT: 18 July 2025 DP * *********************************************************************/ ******************************************************************************** diff --git a/input/InitialPopulations/compile/09_finalise_input_data.do b/input/InitialPopulations/compile/09_finalise_input_data.do index 477110f0d..223fa7ba0 100644 --- a/input/InitialPopulations/compile/09_finalise_input_data.do +++ b/input/InitialPopulations/compile/09_finalise_input_data.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 15 Dec 2025 +* LAST UPDATE: 18 July 2025 * NOTE: Called from 00_master.do - see master file for further details *************************************************************************************** @@ -180,24 +180,24 @@ forvalues yy = $firstSimYear/$lastSimYear { sum one [w=dwt] *limit saved variables - keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dhe ydses_c5 /// + keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 /// yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 /// stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt /// - potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// + 
potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost carehoursprovidedweekly /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dls dot unemp financial_distress + ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress - order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen /// + order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen /// dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag /// - multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// + multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost carehoursprovidedweekly /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dls dot unemp financial_distress + ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress - recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv 
dhhtp_c4 ssscp /// + recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp /// dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 /// - adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs /// + adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs carehoursprovidedweekly /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dls dot unemp financial_distress (missing=-9) + formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress (missing=-9) gsort idhh idbenefitunit idperson save "$dir_data/population_initial_UK_$year.dta", replace @@ -207,7 +207,7 @@ forvalues yy = $firstSimYear/$lastSimYear { } cap log close -*************************************************************************************** +/**************************************************************************************** * finalise *************************************************************************************** #delimit ; @@ -215,7 +215,7 @@ local files_to_drop was_wealthdata.dta ; #delimit cr // cr stands for carriage return -/* + foreach file of local files_to_drop { erase "$dir_data/`file'" } diff --git a/input/InitialPopulations/compile/10_check_yearly_data.do b/input/InitialPopulations/compile/10_check_yearly_data.do index 4bd3570c7..7123dda82 
100644 --- a/input/InitialPopulations/compile/10_check_yearly_data.do +++ b/input/InitialPopulations/compile/10_check_yearly_data.do @@ -6,24 +6,25 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova -* LAST UPDATE: 15 Dec 2025 DP +* LAST UPDATE: 18 July 2025 DP * NOTE: Called from 00_master.do - see master file for further details ***************************************************************************************/* -set matsize 15000 +set matsize 11000, permanently ********************************************************************************/ cap log close log using "${dir_log}/10_check_yearly_data.log", replace ******************************************************************************** + *all variables #delimit ; local varlist -idhh -idbenefitunit -idperson -idpartner -idmother -idfather +idhh +idbenefitunit +idperson +idpartner +idmother +idfather pno swv dgn @@ -35,7 +36,8 @@ ded deh_c3 sedex les_c3 -dlltsd +dlltsd +dlltsd01 dhe ydses_c5 yplgrs_dv @@ -89,7 +91,11 @@ ypnoab dhe_mcs dhe_pcs dot +dot01 unemp +dls +financial_distress +carehoursprovidedweekly ; #delimit cr // cr stands for carriage return @@ -111,22 +117,22 @@ lesdf_c4 les_c4 lessp_c4 drgn1 -dot +dot +dot01 ; #delimit cr // cr stands for carriage return *new varlist with categorical variables outputted by category #delimit ; -local varlist2 -idhh -idbenefitunit -idperson -idpartner -idmother -idfather -pno -ppno +local varlist2 +idhh +idbenefitunit +idperson +idpartner +idmother +idfather +pno swv dgn dag @@ -135,7 +141,8 @@ dnc02 dnc ded sedex -dlltsd +dlltsd +dlltsd01 ypncp ypnoab yplgrs_dv @@ -233,71 +240,46 @@ daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost -liquid_wealth +liquid_wealth +dhemcs +dhepcs dot_1 dot_2 dot_3 dot_4 -dot_5 +dot01_1 +dot01_2 +dot01_3 +dot01_4 +dot01_5 +dot01_6 +unemp +dls +financial_distress +carehoursprovidedweekly ; #delimit cr // cr stands for carriage return -cap erase 
"$dir_data/population_initial_UK_orig_sumstats.xls" + cap erase "$dir_data/population_initial_UK_sumstats.xls" cap erase "$dir_data/population_initial_fs_UK_sumstats.xls" -cap erase "$dir_data/population_initial_UK_orig_sumstats.txt" cap erase "$dir_data/population_initial_UK_sumstats.txt" cap erase "$dir_data/population_initial_fs_UK_sumstats.txt" -/******************************************************* -*output summary stats for orignal initial populations * -******************************************************* -forvalues year=2010/2017 { -insheet using "${dir_ipop_orig}/population_initial_UK_`year'.csv", clear -save "$dir_data/population_initial_UK_`year'_orig.dta", replace - -gen adult = dag>=$age_become_responsible -gen child = 1 - adult -gen dehmf_c3 = 0 -gen dhe_mcs = 0 -gen dhe_pcs = 0 -gen dot = 0 -gen unemp = 0 - - -foreach var of local varlist_cat { -recode `var' (0=.) (-9=.) -cap drop `var'_* -tab `var', gen(`var'_) - } - - -foreach var of local varlist2 { -recode `var' (-9=.) - } - -foreach var in need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// -liquid_wealth { -recode `var' (.=0) -} - - -order `varlist2' -qui sum `varlist2' , de -save "$dir_data/population_initial_UK_`year'_orig.dta", replace -outreg2 using "$dir_data/population_initial_UK_orig_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2') - -} -*/ ******************************************************* *output summary stats for new initial populations * ******************************************************* -forvalues year=2010/2023 { +forvalues year=2010/2023 { use "$dir_data/population_initial_UK_`year'.dta", clear +cap drop dhemcs dhepcs +clonevar dhemcs=dhe_mcs +clonevar dhepcs=dhe_pcs + + foreach var of local varlist_cat { recode `var' (0=.) (-9=.) cap drop `var'_* @@ -309,25 +291,24 @@ foreach var of local varlist2 { recode `var' (-9=.) 
} -foreach var in need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// -liquid_wealth { +foreach var in need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost liquid_wealth carehoursprovidedweekly { recode `var' (.=0) } - order `varlist2' qui sum `varlist2' , de -save "$dir_data/population_initial_UK_`year'.dta", replace +//save "$dir_data/population_initial_UK_`year'.dta", replace outreg2 using "$dir_data/population_initial_UK_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2') } - +/* ********************************************************************** *output summary stats for new initial populations before dropping hhs* ********************************************************************** -forvalues year=2010/2023 { +forvalues year=2010/2023 { use "$dir_data/population_initial_fs_UK_`year'.dta", clear +rename careHoursProvidedWeekly carehoursprovidedweekly cap gen dwt_sampling =0 cap gen uk_pop=0 @@ -336,6 +317,10 @@ cap gen multiplier=0 cap gen adult = dag>=$age_become_responsible cap gen child = 1 - adult +cap drop dhemcs dhepcs +clonevar dhemcs=dhe_mcs +clonevar dhepcs=dhe_pcs + foreach var of local varlist_cat { recode `var' (0=.) (-9=.) cap drop `var'_* @@ -347,45 +332,54 @@ foreach var of local varlist2 { recode `var' (-9=.) 
} + foreach var in need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// -liquid_wealth { +liquid_wealth carehoursprovidedweekly { recode `var' (.=0) } - +keep `varlist2' order `varlist2' qui sum `varlist2' , de -save "$dir_data/population_initial_fs_UK_`year'.dta", replace +//save "$dir_data/population_initial_fs_UK_`year'.dta", replace outreg2 using "$dir_data/population_initial_fs_UK_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2') } +*/ -cap erase "$dir_data/population_initial_UK_orig_sumstats.txt" cap erase "$dir_data/population_initial_UK_sumstats.txt" cap erase "$dir_data/population_initial_fs_UK_sumstats.txt" cap log close - - - + +/* ************************************************************* *clean up new initial populations - keep only required vars * ************************************************************* -/* forvalues year=2010/2023 { insheet using "$dir_data/population_initial_UK_`year'.csv", clear -keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex les_c3 dlltsd dhe /// -ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp /// -lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned scghq2_dv_miss_flag lhw drgn1 dct dwt_sampling les_c4 dhm_ghq /// -lessp_c4 adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly /// -liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// - aidhrs carewho ypncp ypnoab dhe_mcs dhe_pcs dot unemp - - -save "$dir_data/population_initial_UK_`year'.dta", replace -outsheet using "$dir_data/population_initial_UK_`year'.csv", nolabel replace + *limit saved variables + keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded 
deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 /// + yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 /// + stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt /// + potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// + ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dot dot01 unemp dhe_mcssp dhe_pcssp + + order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen /// + dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag /// + multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// + ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp + + recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp /// + dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 /// + adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs /// + formal_socare_cost ypncp ypnoab aidhrs carewho 
dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp (missing=-9) + + gsort idhh idbenefitunit idperson + save "$dir_data/population_initial_UK_`year'.dta", replace + export delimited using "$dir_data/population_initial_UK_`year'.csv", nolabel replace } */ diff --git a/input/InitialPopulations/compile/RegressionEstimates/master.do b/input/InitialPopulations/compile/RegressionEstimates/master.do new file mode 100644 index 000000000..2e4ec42cc --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/master.do @@ -0,0 +1,139 @@ + +*************************************************************************************** +* PROJECT: ESPON: regression estimates for SimPaths using UKHLS data +* DO-FILE NAME: master.do +* DESCRIPTION: Main do-file to set the main parameters (country, paths) and call sub-scripts +*************************************************************************************** +* COUNTRY: UK +* DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] +* +* AUTHORS: Daria Popova, Justin van de Ven +* LAST UPDATE: 1 july 2025 DP +*************************************************************************************** + +*************************************************************************************** +* General comments: +* - Note that in the following scripts some standard commands may be +* abbreviated: (gen)erate, (tab)ulate, (sum)marize, (di)splay, +* (cap)ture, (qui)etly, (noi)sily + +*Stata packages to install +*ssc install fre +*ssc install tsspell +*ssc install carryforward +*ssc install outreg2 +*ssc install oparallel +*ssc install gologit2 +* NOTES: Output formatting automated, however if you decide to +* add or take-away variables from the processes you +* will need to update the labelling in the excel files. +* +* The income and union parameter do file must be run after +* the wage estimates are obtain because they use +* predicted wages. The order of the remaining files is +* arbitrary. 
+*************************************************************************************** +*************************************************************************************** + +clear all +set more off +set type double +set maxvar 30000 +set matsize 1000 + + +/************************************************************************************** +* DEFINE DIRECTORIES +**************************************************************************************/ + +* Working directory +global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates" + +* Directory which contains do files +global dir_do "${dir_work}/do" + +* Directory which contains log files +global dir_log "${dir_work}/log" + +* Directory which contains raw output: Excel and Word tables +global dir_raw_results "${dir_work}/raw_results" + +* Directory which contains final Excel files read by the model +global dir_results "${dir_work}/results" + +* Directory which contains pooled dataset for estimates +global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data" + +* Directory containing external input data +global dir_external_data "$dir_work/external_data" + +* Directory containing results of comparison of various weights +global weight_checks "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\weight_checks" + +*********************Internal validation**************************************** +* Directory to save data for internal validation +global dir_validation_data "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\data" + +* Directory for internal validation do-files +global dir_do_validation "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\do_files" + +* Directory for internal validation do-files +global dir_do_validation "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\do_files" + +* Directory for internal validation graphs +global dir_validation_graphs "D:\Dasha\ESSEX\ESPON
2024\UK\regression_estimates\internal_validation\graphs" + +global countyy "UK" + +/******************************************************************************* +* ESTIMATION FILES +*******************************************************************************/ +/* +do "${dir_do}/reg_education.do" + +do "${dir_do}/reg_leaveParentalHome.do" + +do "${dir_do}/reg_partnership.do" + +do "${dir_do}/reg_fertility.do" +*/ + +do "${dir_do}/reg_health.do" + +/* +do "${dir_do}/reg_home_ownership.do" + +do "${dir_do}/reg_retirement.do" + +do "${dir_do}/reg_wages.do" + +do "${dir_do}/reg_income.do" + + + +/* +******************************************************************************* +* INTERNAL VALIDATION FILES +*******************************************************************************/ + +do "$dir_do_validation/int_val_education.do" + +do "$dir_do_validation/int_val_leave_parental_home.do" + +do "$dir_do_validation/int_val_partnership.do" + +do "$dir_do_validation/int_val_fertility.do" + +do "$dir_do_validation/int_val_health.do" + +do "$dir_do_validation/int_val_home_ownership.do" + +do "$dir_do_validation/int_val_retirement.do" + +do "$dir_do_validation/int_val_wages.do" + +do "$dir_do_validation/int_val_income.do" + +/************************************************************************************** +* END OF FILE +**************************************************************************************/ diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_education.do b/input/InitialPopulations/compile/RegressionEstimates/reg_education.do index f5485e6e2..881ab29a5 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_education.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_education.do @@ -1,10 +1,15 @@ ******************************************************************************** * PROJECT: ESPON * SECTION: Education -* OBJECT: Final Probit Models - Weighted -* AUTHORS: Daria Popova, Justin van de Ven 
-* LAST UPDATE: 21/04/2024 (JV) -******************************************************************************** +* OBJECT: Final Probit & Generalised Logit Models - Weighted +* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven +* LAST UPDATE: 1 July 2025 DP +* COUNTRY: UK +* +* NOTES: +* +******************************************************************************** + clear all set more off set mem 200m @@ -12,174 +17,899 @@ set type double //set maxvar 120000 set maxvar 30000 +******************************************************************* +cap log close +log using "${dir_log}/reg_education.log", replace +******************************************************************* -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -* Directory which contains do files -global dir_do "${dir_work}/do" +do "$dir_do/variable_update" -* Directory which contains data files -global dir_data "${dir_work}/data" -* Directory which contains log files -global dir_log "${dir_work}/log" -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" +* Sample selection +drop if dag < 16 -******************************************************************* -cap log close -log using "${dir_log}/reg_education.log", replace -******************************************************************* +xtset idperson swv -use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear +* Set Excel file +* Info sheet +putexcel set "$dir_results/reg_education", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing projection of education status" +putexcel A2 = "Authors: Patryk Bronka, Justin 
van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" -label variable stm "Year" -label variable les_c3 "Employment Status: 3 Category" -label variable deh_c3 "Educational Attainment: 3 Category" -/* -label variable dehm_c3 "Mother's Educational Attainment: 3 Category" -label variable dehf_c3 "Father's Educational Attainment: 3 Category" -*/ -label variable dehmf_c3 "Highest Parental Educational Attainment: 3 Category" -label variable dhhtp_c4 "Household Type: 4 Category" -label variable dnc "Number of Children in Household" -label variable dnc02 "Number of Children aged 0-2 in Household" - -label value dgn gdr -label value drgn1 rgna -label value les_c3 jbf -label value deh_c3 dehmf_c3 /*dehm_c3 dehf_c3*/ edd -label value ded yn -label value dhhtp_c4 hht +putexcel A5 = "E1a" +putexcel B5 = "Probit regression estimates of remaining in continuous education - individuals aged 16-29 in initial education spell" -drop if dag < 16 +putexcel A6 = "E1b" +putexcel B6 = "Probit regression estimates of returning to education - individuals aged 16-35 not in initial education spell" -replace stm = stm - 2000 -fre stm +putexcel A7 = "E2a" +putexcel B7 = 
"Generalized ordered logit regression estimates of education attainment - individuals aged 16-29 exiting education that were in initial education spell in t-1 but not in t" +putexcel B8 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table" +putexcel B9 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name." -/*check if all covariates are available in the data*/ -recode ded dgn dag dagsq dehmf_c3 drgn1 stm deh_c3 les_c3 (-9=.) +putexcel A10 = "Notes:", bold +putexcel B10 = "Added: ethnicity-4 cat (dot); covid dummies (y2020 y2021)" -xtset idperson swv +putexcel set "$dir_results/reg_education", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + +************************************************************ +* E1a: Probability of Remaining in Initial Education Spell * +************************************************************ +* Process E1a: Remaining in the initial education spell. +* Sample: Individuals aged 16-29 who have not left their initial education spell +* DV: In continuous education dummy +* Note: Condition implies some persistence - education for the last 2 years. -********************************** -*Probability of Being a Student * -********************************** -*Process E1a: Probability of being in education. Sample: Individuals aged 16-29 in continuous education. -*or probability of remaining in education for those who have always been in education without interruptions. +fre ded if (dag >= 16 & dag <= 29 & l.ded == 1) +// was in initial education spell in the previous wave +// 70.1% remain in education -*sample: Individuals aged 16-29 in continuous education. 
-fre ded if (dag>=16 & dag<=29 & l.ded==1) /*was in continious education in the previous wave */ +/*////////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// + if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_E1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(E1b, dimlwt) side dec(4) -probit ded i.dgn dag dagsq ib1.dehmf_c3 /*ib1.dehm_c3 ib1.dehf_c3*/ ib8.drgn1 stm if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimxwt], vce(robust) +probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// + if (dag>=16 & dag<=29 & l.ded==1) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_E1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1b, disclwt) side dec(4) + +probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// + if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_E1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1b, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_E1a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ +probit ded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// + if (dag>=16 & dag<=29 & l.ded==1) [pweight=dimxwt], vce(robust) + + * save raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/education", sheet("Process E1a") replace +putexcel set "$dir_raw_results/education/education", sheet("Process E1a") replace putexcel A3 = 
matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/E1a.doc", replace /// -title("Process E1a: Probability of remaining in continuous education - individuals aged 16-29 in continuous education.") /// +outreg2 stats(coef se pval) using "$dir_raw_results/education/E1a.doc", replace /// +title("Process E1a: Probability of remaining in initial education spell - individuals aged 16-29 in initial education spell.") /// ctitle(Continuing student) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/E1a_sample", replace + + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) -**************************************** -*Probability of Returning to education * -**************************************** -*Process E1b: Probability of being in education. Sample: Individuals aged 16-35 not in continuous education. -*Or probability of returning to education for those who had left school. -*sample: Individuals aged 16-35 not in continuous education. 
-fre der if (dag>=16 & dag<=35 & ded==0) +* Results -probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3 /*ib1.dehm_c3 ib1.dehf_c3*/ ib8.drgn1 stm if (dag>=16 & dag<=35 & ded==0) [pweight=dimlwt], vce(robust) +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/education/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/education/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_education", sheet("UK_E1a") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+
+// Copy the non-zero entries of b into nonzero_b, preserving their order
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_education", sheet("UK_E1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+
+
+* Labelling: regressor names down column A (rows 2-24) and across row 1 (columns C-Y)
+
+putexcel A1 = "REGRESSOR" // header spelling fixed (was "REGREESOR") to match the UK_E1b and UK_E2a sheets
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Dehmf_c3_Medium"
+putexcel A6 = "Dehmf_c3_Low"
+putexcel A7 = "UKC"
+putexcel A8 = "UKD"
+putexcel A9 = "UKE"
+putexcel A10 = "UKF"
+putexcel A11 = "UKG"
+putexcel A12 = "UKH"
+putexcel A13 = "UKJ"
+putexcel A14 = "UKK"
+putexcel A15 = "UKL"
+putexcel A16 = "UKM"
+putexcel A17 = "UKN"
+putexcel A18 = "Year_transformed"
+putexcel A19 = "Y2020"
+putexcel A20 = "Y2021"
+putexcel A21 = "Ethn_Asian"
+putexcel A22 = "Ethn_Black"
+putexcel A23 = "Ethn_Other"
+putexcel A24 = "Constant"
+
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Dehmf_c3_Medium"
+putexcel G1 = "Dehmf_c3_Low"
+putexcel H1 = "UKC"
+putexcel I1 = "UKD"
+putexcel J1 = "UKE"
+putexcel K1 = "UKF"
+putexcel L1 = "UKG"
+putexcel M1 = "UKH"
+putexcel N1 = "UKJ"
+putexcel O1 = "UKK"
+putexcel P1 = "UKL"
+putexcel Q1 = "UKM"
+putexcel R1 = "UKN"
+putexcel S1 = "Year_transformed"
+putexcel T1 = "Y2020"
+putexcel U1 = "Y2021"
+putexcel V1 = "Ethn_Asian"
+putexcel W1 = "Ethn_Black"
+putexcel X1 = "Ethn_Other"
+putexcel Y1 = "Constant"
+
+
+* Goodness of fit: write the E1a scalars saved after the probit to the shared "Gof" sheet
+
+putexcel set "$dir_results/reg_education", sheet("Gof") modify
+
+putexcel A3 = "E1a - Remaining in initial education spell", bold
+
+putexcel A5 = "Pseudo R-squared"
+putexcel B5 = r2_p
+putexcel A6 = "N"
+putexcel B6 = N
+putexcel E5 = "Chi^2"
+putexcel F5 = chi2
+putexcel E6 = "Log likelihood"
+putexcel F6 = ll
+
+drop in_sample p
+scalar drop r2_p N chi2 ll
+
+
+**********************************************
+* E1b: Probability of
Returning to Education * +********************************************** + +* Process E1b: Retraining having previously entered the labour force. +* Sample: Individuals aged 16-35 who have left their initial education spell +* and not a student last year +* DV: Return to education + +fre der if (dag >= 16 & dag <= 35 & ded == 0) +// 69.3% remain out of education + +/*////////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// +if (dag >= 16 & dag <= 35 & ded==0 & l.der==0) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_E1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(E1b, dimlwt) side dec(4) + +probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// +if (dag >= 16 & dag <= 35 & ded==0 & l.der==0) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_E1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1b, disclwt) side dec(4) + +probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// +if (dag >= 16 & dag <= 35 & ded==0 & l.der==0) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_E1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E1b, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_E1b.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ +probit der i.dgn dag dagsq lib1.deh_c3 li.les_c3 l.dnc l.dnc02 ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot /// +if (dag >= 16 & dag 
<= 35 & ded==0 & l.der==0) /// + [pweight=dimxwt], vce(robust) + + * save raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/education", sheet("Process E1b") modify +putexcel set "$dir_raw_results/education/education", sheet("Process E1b") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/E1b.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/education/E1b.doc", replace /// title("Process E1b: Probability of returning to education - individuals aged 16-35 not in continuous education.") /// ctitle(Returning student) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +gen in_sample = e(sample) -******************************************** -*Educational level after leaving education * -******************************************** -*Process E2: Educational attainment. Sample: Respondents from Process 1a who have left education. -*Or Level of education for those leaving education. +predict p -*sample: Individuals aged 16-29 who were in continuous education and left it. 
-fre deh_c3 if (dag>=16 & dag<=29) & l.ded==1 & ded==0 +save "$dir_validation_data/E1b_sample", replace -/* -mprobit deh_c3 i.dgn dag dagsq ib1.dehm_c3 ib1.dehf_c3 ib8.drgn1 stm if sedcsmpl==1 [pweight=dimxwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/education.xlsx", sheet("Process E2 - Education Level") modify -putexcel A1 = matrix(results), names nformat(number_d2) +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) -mprobit deh_c3 i.dgn dag dagsq ib1.dehm_c3 ib1.dehf_c3 ib8.drgn1 stm if sedcsmpl==1 [pweight=dimxwt], vce(robust) -matrix e2=get(VCE) -matrix list e2 -putexcel set "$dir_data/edu_vcm.xlsx", sheet("Process E2 - Education Level") modify -putexcel A1 = matrix(e2), names -//capture log close -*/ +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/education/var_cov", sheet("var_cov") /// + replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/education/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_education", sheet("UK_E1b") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+ +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_education", sheet("UK_E1b") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dgn" +putexcel A3 = "Dag" +putexcel A4 = "Dag_sq" +putexcel A5 = "Deh_c3_Medium_L1" +putexcel A6 = "Deh_c3_Low_L1" +putexcel A7 = "Les_c3_NotEmployed_L1" +putexcel A8 = "Dnc_L1" +putexcel A9 = "Dnc02_L1" +putexcel A10 = "Dehmf_c3_Medium" +putexcel A11 = "Dehmf_c3_Low" +putexcel A12 = "UKC" +putexcel A13 = "UKD" +putexcel A14 = "UKE" +putexcel A15 = "UKF" +putexcel A16 = "UKG" +putexcel A17 = "UKH" +putexcel A18 = "UKJ" +putexcel A19 = "UKK" +putexcel A20 = "UKL" +putexcel A21 = "UKM" +putexcel A22 = "UKN" +putexcel A23 = "Year_transformed" +putexcel A24 = "Y2020" +putexcel A25 = "Y2021" +putexcel A26 = "Ethn_Asian" +putexcel A27 = "Ethn_Black" +putexcel A28 = "Ethn_Other" +putexcel A29 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dgn" +putexcel D1 = "Dag" +putexcel E1 = "Dag_sq" +putexcel F1 = "Deh_c3_Medium_L1" +putexcel G1 = "Deh_c3_Low_L1" +putexcel H1 = "Les_c3_NotEmployed_L1" +putexcel I1 = "Dnc_L1" +putexcel J1 = "Dnc02_L1" +putexcel K1 = "Dehmf_c3_Medium" +putexcel L1 = "Dehmf_c3_Low" +putexcel M1 = "UKC" +putexcel N1 = "UKD" +putexcel O1 = "UKE" +putexcel P1 = "UKF" +putexcel Q1 = "UKG" +putexcel R1 = "UKH" +putexcel S1 = "UKJ" +putexcel T1 = "UKK" +putexcel U1 = "UKL" +putexcel V1 = "UKM" +putexcel W1 = "UKN" +putexcel X1 = "Year_transformed" +putexcel Y1 = "Y2020" +putexcel Z1 = "Y2021" +putexcel AA1 = "Ethn_Asian" +putexcel AB1 = "Ethn_Black" +putexcel AC1 = "Ethn_Other" +putexcel AD1 = "Constant" + +* Goodness of fit + +putexcel set "$dir_results/reg_education", sheet("Gof") modify + +putexcel A8 = "E1b - Returning to education", bold + +putexcel A10 
= "Pseudo R-squared" +putexcel B10 = r2_p +putexcel A11 = "N" +putexcel B11 = N +putexcel E10 = "Chi^2" +putexcel F10 = chi2 +putexcel E11 = "Log likelihood" +putexcel F11 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + +************************************************* +* E2a Educational Level After Leaving Education * +************************************************* + +* Process E2a: Educational level achieved when leaving the initial spell of +* education +* Sample: Those 16-29 who have left their initial education spell in current +* year +* DV: Education level (3 cat) +* Note: Previously tried a multinomial probit, now use a generalised ordered logit + +fre deh_c3 if (dag >= 16 & dag <= 29) & l.ded == 1 & ded == 0 + +recode deh_c3 (1 = 3) (3 = 1), gen(deh_c3_recoded) +lab def deh_c3_recoded 1 "Low" 2 "Medium" 3 "High" +lab val deh_c3_recoded deh_c3_recoded + + +/* Model specification tests + +local model_specification_test=0 + +if `model_specification_test' == 0 { + + * Option 1 - Ordered logit + + * Testing the parallel lines assumption + * - the model asssumes that coefs (apart for the constant) when estimating + * a series of binary probits for 1 vs higher, 1&2 vs higher, 1&2&3 vs + * higher + * - Brant test null: the slope coefficients are the same across response + * all categories (p<0.05 -> violating the prop odds assumption) + + sort idperson swv + + + ologit deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 /// + [pweight = dimxwt], vce(robust) + + oparallel, ic /*note: all tests have very high Chi2 statistics with p-values of 0.000.the parallel lines assumption is violated.*/ + + + * Option 2 - Linear model + + xtset idperson swv + + reg deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust) + + + // obtain distribution of predicted values plot + // make 
sure to add in sampling variance + gen in_sample = e(sample) + + scalar sigma = e(rmse) + gen epsilon = rnormal()*sigma + sum epsilon + predict pred_edu if in_sample == 1 + replace pred_edu = pred_edu + epsilon if in_sample == 1 + + twoway (hist deh_c3_recoded if in_sample == 1 , lcolor(gs12) /// + fcolor(gs12)) (hist pred_edu if in_sample == 1 , /// + fcolor(none) lcolor(red)), xtitle (Education level) /// + legend(lab(1 "Observed") lab( 2 "Predicted")) name(levels, replace) /// + graphregion(color(white)) + drop in_sample pred_edu epsilon -/******************************************************************************* -* Ordered probit model to replace multinomial probit E2a -*******************************************************************************/ + sort idperson swv + + + * Option 3 - Generalized ordered logit + + gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust) autofit + // does the model produce any negative probabilities? + // if so, + // 1 - play around with the controls + // 2 - consider in the simulation converting the negative probabilities + // to be zero and rescaling the cdf to sum to 1 + +} +*/ -*1. 
Recode education level (outcome variable) so 1 = Low education, 2 = Medium education, 3 = High education -recode deh_c3 /// - (1 = 3) /// - (3 = 1) /// - , gen(deh_c3_recoded) +* Generalized ordered logit +sort idperson swv +/* +////////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight=dimlwt], vce(robust) autofit +outreg2 using "${weight_checks}/weight_comparison_E2a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(E2a, dimlwt) side dec(4) + +gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = disclwt], vce(robust) autofit +outreg2 using "${weight_checks}/weight_comparison_E2a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E2a, disclwt) side dec(4) + +gologit2 deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], vce(robust) autofit +outreg2 using "${weight_checks}/weight_comparison_E2a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(E2a, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_E2a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ +gologit2 deh_c3_recoded i.Dgn Dag Dag_sq /// + i.Dehmf_c3_Medium i.Dehmf_c3_Low /// + i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN /// + Year_transformed Y2020 Y2021 /// + i.Ethn_Asian i.Ethn_Black i.Ethn_Other /// +if dag >= 16 & dag <= 29 & l.ded == 1 & ded == 0 [pweight = dimxwt], 
vce(robust) autofit -la def deh_c3_recoded 1 "Low" 2 "Medium" 3 "High" -la val deh_c3_recoded deh_c3_recoded +*Note: In gologit2, the coefficients show how covariates affect the log-odds of being above a certain category vs. at or below it. -//oprobit deh_c3_recoded i.dgn dag dagsq ib1.dehm_c3 ib1.dehf_c3 ib8.drgn1 stm if (dag>=16 & ded == 0) [pweight=dimxwt], vce(robust) -oprobit deh_c3_recoded i.dgn dag dagsq ib1.dehmf_c3 /*ib1.dehm_c3 ib1.dehf_c3*/ ib8.drgn1 stm if (dag>=16 & dag<=29 & l.ded==1 & ded==0) [pweight=dimxwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/education", sheet("Process E2a") modify +putexcel set "$dir_raw_results/education/education", sheet("Process E2a") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/E2a.doc", replace /// -title("Process E2a: Ordered probit for educational attainment - individuals aged 16-29 exiting education.") /// +outreg2 stats(coef se pval) using "$dir_raw_results/education/E2a.doc", replace /// +title("Process E2a: Generalized ordered logit for educational attainment - individuals aged 16-29 who have left initial education spell.") /// ctitle(Education attainment) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +* Save sample inclusion indicator and predicted probabilities +gen in_sample = e(sample) +predict p1 p2 p3 + +* Save sample for later use (internal validation) +save "$dir_validation_data/E2a_sample", replace + +* Store model summary statistics +scalar r2_p = e(r2_p) +scalar N_sample = e(N) + +* Store results in Excel + +* Store estimates in matrices +matrix b = e(b) +matrix V = e(V) + +* Raw output +putexcel set "$dir_results/reg_education", sheet("E2a_raw") modify +putexcel A1 = matrix(b'), names //nformat(number_d2) +putexcel A1 = "CATEGORY" +putexcel B1 = "REGRESSOR" +putexcel C1 = "COEFFICIENT" + +* 
Estimated coefficients
+scalar no_coefs_all = colsof(b)
+
+* Eliminate the zero entries of b (baseline categories) and keep a 0/1 flag of what was retained
+mata:
+    // Call matrices into mata
+    b = st_matrix("b")
+
+    // Find which coefficients are nonzero
+    keep = (b :!= 0)
+
+    // Eliminate zeros
+    nonzero_b = select(b, keep)
+
+    // Inspect
+    nonzero_b
+
+    // Return to Stata
+    st_matrix("nonzero_b", nonzero_b)
+    st_matrix("nonzero_b_flag", keep)
+end
+
+* Inspect
+matrix list b
+matrix list nonzero_b
+matrix list nonzero_b_flag
+
+* Save dimensions
+scalar no_nonzero_b = colsof(nonzero_b)
+scalar no_nonzero_b_per = no_nonzero_b / 2 // number of categories-1: deh_c3_recoded has 3 categories (Low/Medium/High), so 2 equations (was 4)
+
+* Address repetition of proportional odds covariates
+
+* Generate repetition/unique observation flag
+mata:
+    // Import matrices into mata
+    nonzero_b_mata = st_matrix("nonzero_b")
+
+    // Generate binary vector =1 if coefficient repeated
+    n = cols(nonzero_b_mata)
+    repetition_flag = J(n, 1, 0)
+
+    // use tolerance based comparison to avoid precision errors
+    tol = 1e-8
+
+    for (i = 1; i <= n; i++) {
+        for (j = 1; j <= n; j++) {
+            if (i != j && abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) {
+                repetition_flag[i] = 1
+                break
+            }
+        }
+    }
+    repetition_flag
+
+    // Generate binary vector =1 if coefficient not repeated
+    unique_flag = 1 :- repetition_flag
+
+    // Return to Stata
+    st_matrix("repetition_flag", repetition_flag')
+    st_matrix("unique_flag", unique_flag')
+
+end
+
+* Generate vector to multiply the coef vector with to eliminate the
+* repetitions of coefficients for vars that satisfy the proportional odds assumptions
+matrix structure_a = J(1,no_nonzero_b_per,1)
+matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b]
+matrix structure = structure_a, structure_b
+
+* Inspect
+matrix list structure_a
+matrix list structure_b
+matrix list structure
+matrix list nonzero_b
+
+* Eliminate repetitions
+mata:
+    // Call matrices into mata
+    var = st_matrix("var") // NOTE(review): not used in this mata block - confirm it can be dropped
+    structure = st_matrix("structure")
+    nonzero_b =
st_matrix("nonzero_b") + + // Convert reptitions into zeros + b_structure = structure :* nonzero_b + + b_structure + + // Eliminate zeros + keep = (b_structure :!= 0) + + nonzero_b_structure = select(b_structure, keep) + + // Export to Stata + st_matrix("b_structure", b_structure) + st_matrix("nonzero_b_structure", nonzero_b_structure) + +end + +matrix list nonzero_b_structure + +* Export into Excel +putexcel set "$dir_results/reg_education", sheet("UK_E2a") modify +putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) + + + +* Variance-covariance matrix +* ELiminate zeros (baseline categories) +mata: + V = st_matrix("V") + b = st_matrix("b") + + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + V_trimmed + + // Return to Stata + st_matrix("var", V_trimmed) +end + +matrix list var + +* Address repetition due to proportional odds being satisfied for some covars +matrix square_structure_a = J(no_nonzero_b,1,1) * structure +matrix square_structure_b = square_structure_a' + +matrix list square_structure_a +matrix list square_structure_b +mata: + // Call matrices into mata + var = st_matrix("var") + + // Create structure matrix (0 = eliminate) + square_structure_a = st_matrix("square_structure_a") + square_structure_b = st_matrix("square_structure_b") + + // Element-by-element multiplication + square_structure = square_structure_a :* square_structure_b + var_structure = square_structure :* var + + // Eliminate zeros + row_keep = rowsum(abs(var_structure)) :!= 0 + col_keep = colsum(abs(var_structure)) :!= 0 + + nonzero_var_structure = select(select(var_structure, row_keep), col_keep) + + // Return to Stata + st_matrix("nonzero_var_structure", nonzero_var_structure) +end + +matrix list nonzero_var_structure + +* Export to Excel +putexcel set "$dir_results/reg_education", sheet("UK_E2a") modify +putexcel C2 = matrix(nonzero_var_structure) + + +* Labels 
+putexcel set "$dir_results/reg_education", sheet("UK_E2a") modify + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +/* Create temporary frame ==> not available in stata 14 +frame create temp_frame +frame temp_frame: { + + mata: + // Import matrices from Stata + nonzero_b_flag = st_matrix("nonzero_b_flag")' + unique_flag = st_matrix("unique_flag")' + structure = st_matrix("structure")' + stripe = st_matrixcolstripe("e(b)") + + // Extract variable and category names + catnames = stripe[.,1] + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + catnames_no_bl = select(catnames, nonzero_b_flag :== 1) + + // Create and clean labels + // Address lags + labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl) + + // Add category + labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0)) + + // Remove 1. + labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1) + + // Constant + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + + nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) + + // Add v1 + nonzero_labels_structure = "v1"\nonzero_labels_structure + + // Create temp file with results + fh = fopen("$dir_results/temp_labels.txt", "w") + for (i=1; i<=rows(nonzero_labels_structure); i++) { + fput(fh, nonzero_labels_structure[i]) + } + fclose(fh) + end + */ + * Here's a replacement for stata 14: +local dir_results "$dir_results" + +preserve +* Run Mata block +mata: + // Import matrices from Stata + nonzero_b_flag = st_matrix("nonzero_b_flag")' + unique_flag = st_matrix("unique_flag")' + structure = st_matrix("structure")' + stripe = st_matrixcolstripe("e(b)") + + // Extract variable and category names + catnames = stripe[.,1] + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + catnames_no_bl = select(catnames, nonzero_b_flag 
:== 1) + + // Handle lags + labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl) + + // Add category name when flag is not unique + labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0)) + + // Clean labels + labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1) + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + + // Filter for structure == 1 + nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) + + // Add header row + nonzero_labels_structure = "v1"\nonzero_labels_structure + + // Write to temporary file + fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w") + for (i=1; i<=rows(nonzero_labels_structure); i++) { + fput(fh, nonzero_labels_structure[i]) + } + fclose(fh) +end + + * Import cleaned labels into Stata as new dataset + import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8) + gen n = _n + + * Export labels to Excel + putexcel set "$dir_results/reg_education", sheet("UK_E2a") modify + + * Vertical labels + sum n, meanonly + local N = r(max)+1 + + forvalue i = 2/`N' { + local j = `i' - 1 + putexcel A`i' = v1[`j'] + } + + * Horizontal labels + sum n, meanonly + local N = r(max) + 1 // Adjusted since we're working across columns + + forvalues j = 1/`N' { + local n = `j'+2 // Shift by 2 to start from column C + local col "" + + while `n' > 0 { + local rem = mod(`n' - 1, 26) + local col = char(65 + `rem') + "`col'" + local n = floor((`n' - 1)/26) + } + + putexcel `col'1 = v1[`j'] + } + + *Clean up + erase "$dir_results/temp_labels.txt" + + +* Goodness of fit + +putexcel set "$dir_results/reg_education", sheet("Gof") modify + +putexcel A13 = "E2a - Education attainment, not in initial education spell", bold + +putexcel A15 = "Pseudo R-squared" +putexcel B15 = r2_p +putexcel A16 = "N" +putexcel B16 = N_sample + +restore +* Clean up +drop in_sample 
p1 p2 p3 +scalar drop _all +matrix drop _all +//frame drop temp_frame + capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do index 9ae5d272d..5f0e6a6bc 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do @@ -3,7 +3,11 @@ * SECTION: Fertility * OBJECT: Final Probit Models * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* LAST UPDATE: 1 July 2025 DP +* COUNTRY: UK +* +* NOTES: Simplified the fertility process for those in this initial +* education spell. ******************************************************************************** clear all set more off @@ -12,116 +16,415 @@ set type double //set maxvar 120000 set maxvar 30000 +******************************************************************* +cap log close +log using "${dir_log}/reg_fertility.log", replace +******************************************************************* +use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +do "$dir_do/variable_update" -* Directory which contains do files -global dir_do "${dir_work}/do" -* Directory which contains data files -global dir_data "${dir_work}/data" +* sample selection +drop if dag < 16 -* Directory which contains log files -global dir_log "${dir_work}/log" -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" +* Set Excel file +* Info sheet -******************************************************************* -cap log close -log using 
"${dir_log}/reg_fertility.log", replace -******************************************************************* -use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear +putexcel set "$dir_results/reg_fertility", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing projection of fertility" +putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "F1a" +putexcel B5 = "Probit regression estimates of the probability of having a child for women aged 18-44 in initial education spell" +putexcel A6 = "F1b" +putexcel B6 = "Probit regression estimates of probability of having a child for women aged 18-44 not in initial education spell" -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" - -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" -label variable dhhtp_c4 "Household Type: 4 Category" -label variable stm "Year" -label variable les_c3 "Employment Status: 3 Category" -label variable dhe "Self-rated Health" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable dnc "Number of Children in Household" -label variable dnc02 "Number of Children aged 0-2 in Household" -label variable ydses_c5 "Annual Household Income Quintile" -label variable 
dukfr "UK Fertility Rate" - -label value dgn gdr -label value drgn1 rgna -label value dhhtp_c4 hht -label value les_c3 jbf -label value deh_c3 edd -label value ded yn +putexcel A10 = "Notes:", bold +putexcel B10 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot), covid dummies (y2020 y2021)" +putexcel B11 = "F1a: only 24 obs having a child when in initial education spell, therefore have to take away some covariates to obtain estimate" -drop if dag < 16 -replace stm = stm - 2000 -/*check if all covariates are available in the data*/ -recode dhe dnc dnc02 deh_c3 les_c3 ydses_c5 dcpst drgn1 sprfm scedsmpl dukfr (-9=. ) -recode dchpd (-9=0) +putexcel set "$dir_results/reg_fertility", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold xtset idperson swv +********************************************** +* F1a - Having a child, in initial edu spell * +********************************************** + +* Process F1a: Probability of having a child +* Sample: Women aged 18-44, in initial education spell. +* DV: New born child dummy (note that in the estimation sample dchpd contains the number of newborn children, which could be >1) + +replace dchpd=1 if dchpd>1 & dchpd<. 
+// only 69 ppl meet the condition in total +tab dchpd if (sprfm == 1 & ded == 1) + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if /// + sprfm == 1 & ded == 1 [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_F1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(F1a, dimlwt) side dec(4) -********************************************************************** -*Proces F1a - Probability of Having a Child - In continuous education -********************************************************************** -*Sample: Women aged 18-44 not in continuous education. -probit dchpd dag l.dnc il.dnc02 ib1.dcpst if (sprfm==1 & scedsmpl==1) [pweight=disclwt], vce(robust) +probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if /// + sprfm == 1 & ded == 1 [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_F1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1a, disclwt) side dec(4) + +probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if /// + sprfm == 1 & ded == 1 [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_F1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1a, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_F1a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ + +probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if /// + sprfm == 1 & ded == 1 [pweight=dimxwt], vce(robust) + + +* raw results matrix results = r(table) matrix results = 
results[1..6,1...]' -putexcel set "$dir_data/Fertility_w", sheet("Process F1a - In education") replace +putexcel set "$dir_raw_results/fertility/fertility", sheet("Process F1a - In education") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/F1a.doc", replace /// -title("Process F1a: Probability of giving birth to a child. Sample: Women aged 18-44 in continuous education.") /// +outreg2 stats(coef se pval) using "$dir_raw_results/fertility/F1a.doc", replace /// +title("Process F1a: Probability of giving birth to a child. Sample: Women aged 18-44 in initial education spell.") /// ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/F1a_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/fertility/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/fertility/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_fertility", sheet("UK_F1a") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+ +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_fertility", sheet("UK_F1a") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dag" +putexcel A3 = "Dcpst_Single" +putexcel A4 = "Year_transformed" +putexcel A5 = "Ethn_Black" +putexcel A6 = "Constant" +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dag" +putexcel D1 = "Dcpst_Single" +putexcel E1 = "Year_transformed" +putexcel F1 = "Ethn_Black" +putexcel G1 = "Constant" -************************************************************************ -*Proces F1b Probability of Having a Child - Not in continuous education -************************************************************************* -*Sample: Women aged 18-44 not in continuous education. -gen ddnc02 = (dnc02 > 0) -probit dchpd dag dagsq l.dnc l.ddnc02 ib1.dhe ib1.dcpst dukfr li.les_c3 ib8.drgn1 if (sprfm==1 & scedsmpl==0) [pweight=disclwt], vce(robust) + +* Goodness of fit +putexcel set "$dir_results/reg_fertility", sheet("Gof") modify + +putexcel A3 = "F1a - Fertility in initial education spell", bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N +putexcel E5 = "Chi^2" +putexcel F5 = chi2 +putexcel E6 = "Log likelihood" +putexcel F6 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + +************************************************ +* F1b - Having a child, left initial edu spell * +************************************************ + +* Process F1b: Probability of having a child +* Sample: Women aged 18-44, left initial education spell +* DV: New born child dummy + +tab dchpd if (sprfm == 1 & ded == 0) + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights 
////////////////////////////////////////////////////////////////////////////////// +probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ /// + lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + (sprfm == 1 & ded == 0) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_F1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(F1b, dimlwt) side dec(4) + +probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ /// + lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + (sprfm == 1 & ded == 0) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_F1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1b, disclwt) side dec(4) + +probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ /// + lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + (sprfm == 1 & ded == 0) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_F1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(F1b, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_F1b.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ + +probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ /// + lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// + (sprfm == 1 & ded == 0) [pweight=dimxwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/Fertility_w", sheet("Process F1b - Not in education") modify +putexcel set "$dir_raw_results/fertility/fertility", sheet("Process F1b - Not in education") modify 
putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) - -outreg2 stats(coef se pval) using "$dir_data/F1b.doc", replace /// -title("Process F1b: Probability of giving birth to a child. Sample: Women aged 18-44 not in continuous education.") /// +outreg2 stats(coef se pval) using "$dir_raw_results/fertility/F1b.doc", replace /// +title("Process F1b: Probability of giving birth to a child. Sample: Women aged 18-44 not in initial education spell.") /// ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/F1b_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve +putexcel set "$dir_raw_results/fertility/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/fertility/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_fertility", sheet("UK_F1b") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+ +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_fertility", sheet("UK_F1b") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) -capture log close + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dag" +putexcel A3 = "Dag_sq" +putexcel A4 = "Ydses_c5_Q2_L1" +putexcel A5 = "Ydses_c5_Q3_L1" +putexcel A6 = "Ydses_c5_Q4_L1" +putexcel A7 = "Ydses_c5_Q5_L1" +putexcel A8 = "Dnc_L1" +putexcel A9 = "Dnc02_L1" +putexcel A10 = "Dhe_pcs" +putexcel A11 = "Dhe_mcs" +putexcel A12 = "Dcpst_Single_L1" +putexcel A13 = "Dcpst_PreviouslyPartnered_L1" +putexcel A14 = "Deh_c3_Medium" +putexcel A15 = "Deh_c3_Low" +putexcel A16 = "FertilityRate" +putexcel A17 = "Les_c3_Student_L1" +putexcel A18 = "Les_c3_NotEmployed_L1" +putexcel A19 = "UKC" +putexcel A20 = "UKD" +putexcel A21 = "UKE" +putexcel A22 = "UKF" +putexcel A23 = "UKG" +putexcel A24 = "UKH" +putexcel A25 = "UKJ" +putexcel A26 = "UKK" +putexcel A27 = "UKL" +putexcel A28 = "UKM" +putexcel A29 = "UKN" +putexcel A30 = "Year_transformed" +putexcel A31 = "Y2020" +putexcel A32 = "Y2021" +putexcel A33 = "Ethn_Asian" +putexcel A34 = "Ethn_Black" +putexcel A35 = "Ethn_Other" +putexcel A36 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dag" +putexcel D1 = "Dag_sq" +putexcel E1 = "Ydses_c5_Q2_L1" +putexcel F1 = "Ydses_c5_Q3_L1" +putexcel G1 = "Ydses_c5_Q4_L1" +putexcel H1 = "Ydses_c5_Q5_L1" +putexcel I1 = "Dnc_L1" +putexcel J1 = "Dnc02_L1" +putexcel K1 = "Dhe_pcs" +putexcel L1 = "Dhe_mcs" +putexcel M1 = "Dcpst_Single_L1" +putexcel N1 = "Dcpst_PreviouslyPartnered_L1" +putexcel O1 = "Deh_c3_Medium" +putexcel P1 = "Deh_c3_Low" +putexcel Q1 = "FertilityRate" +putexcel R1 = "Les_c3_Student_L1" +putexcel S1 = "Les_c3_NotEmployed_L1" +putexcel T1 = "UKC" +putexcel U1 = "UKD" +putexcel V1 = "UKE" +putexcel W1 = "UKF" 
+putexcel X1 = "UKG" +putexcel Y1 = "UKH" +putexcel Z1 = "UKJ" +putexcel AA1 = "UKK" +putexcel AB1 = "UKL" +putexcel AC1 = "UKM" +putexcel AD1 = "UKN" +putexcel AE1 = "Year_transformed" +putexcel AF1 = "Y2020" +putexcel AG1 = "Y2021" +putexcel AH1 = "Ethn_Asian" +putexcel AI1 = "Ethn_Black" +putexcel AJ1 = "Ethn_Other" +putexcel AK1 = "Constant" + + +* Goodness of fit +putexcel set "$dir_results/reg_fertility", sheet("Gof") modify +putexcel A9 = "F1b - Fertility left initial education spell", bold +putexcel A11 = "Pseudo R-squared" +putexcel B11 = r2_p +putexcel A12 = "N" +putexcel B12 = N +putexcel E11 = "Chi^2" +putexcel F11 = chi2 +putexcel E12 = "Log likelihood" +putexcel F12 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + +capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_health.do b/input/InitialPopulations/compile/RegressionEstimates/reg_health.do index 7da6ae616..0a08a84cd 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_health.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_health.do @@ -1,9 +1,12 @@ ******************************************************************************** * PROJECT: ESPON * SECTION: Health -* OBJECT: Final Probit and Linear Regression Models - Weighted +* OBJECT: Health status and Disability * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* LAST UPDATE: 1 July 2025 DP +* COUNTRY: UK +* +* NOTES: ******************************************************************************** clear all set more off @@ -17,7 +20,8 @@ set maxvar 30000 * DEFINE DIRECTORIES *******************************************************************************/ * Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +//global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates" * Directory which contains do files 
global dir_do "${dir_work}/do" @@ -29,137 +33,1032 @@ global dir_data "${dir_work}/data" global dir_log "${dir_work}/log" * Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - +//global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" +global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data" ******************************************************************* cap log close log using "${dir_log}/reg_health.log", replace ******************************************************************* - use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear +do "$dir_do/variable_update" -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" +* Sample selection +drop if dag < 16 -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" -label variable dhhtp_c4 "Household Type: 4 Category" -label variable stm "Year" -label variable les_c3 "Employment Status: 3 Category" -label variable dhe "Self-rated Health" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable ydses_c5 "Annual Household Income Quintile" -label variable dlltsd "Long-term Sick or Disabled" - -label value dgn gdr -label value drgn1 rgna -label value dhhtp_c4 hht -label value les_c3 jbf -label value deh_c3 edd -label value ded yn +* 
Set Excel file -drop if dag < 16 -replace stm = stm - 2000 +* Info sheet (replaces any existing $dir_results/reg_health workbook) + +putexcel set "$dir_results/reg_health", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing projection self-reported health status" +putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold + +putexcel A5 = "H1a" +putexcel B5 = "Generalized ordered logit regression estimates of self reported health status - individuals aged 16-29 in initial education spell" +putexcel B6 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table" +putexcel B7 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name." + +putexcel A8 = "H1b" +putexcel B8 = "Generalized ordered logit regression estimates of self reported health status - individuals aged 16+ not in initial education spell" +putexcel B9 = "Covariates that satisfy the parallel lines assumption have one estimate for all categories of the dependent variable and are present once in the table" +putexcel B10 = "Covariates that do not satisfy the parallel lines assumption have an estimate for each estimated category of the dependent variable. These covariates have the dependent variable category appended to their name." + +putexcel A11 = "H2b" +putexcel B11 = "Probit regression estimates of the probability of being long-term sick or disabled - people aged 16+ not in initial education spell" + +putexcel A12 = "H1a_raw" +putexcel B12 = "Raw generalized ordered logit regression estimates of self reported health status - individuals aged 16-29 in initial education spell. Useful for the 'Gologit predictor' file." 
+putexcel A13 = "H1b_raw" +putexcel B13 = "Raw generalized ordered logit regression estimates of self reported health status - individuals aged 16+ not in initial education spell. Useful for the 'Gologit predictor' file." + +putexcel A15 = "Notes:", bold +putexcel B15 = "All processes: replaced lagged dhe with lagged dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot), covid dummies (y2020 y2021)" +putexcel B16 = "H1a and H1b: excluded those with imputed values of dhe" +putexcel B17 = "H1a: some covariates had to be dropped to obtain estimates; lagged income quintile is treated as continuous variable" +putexcel B18 = "H2b: used wider definition of disability (Dlltsd01), incl those declaring themselves as disabled or receiving disability benefits" + +putexcel set "$dir_results/reg_health", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold -/*check if all covariates are available in the data*/ -recode dhe deh_c3 les_c3 ydses_c5 dhhtp_c4 drgn1 stm (0= .) (-9=. ) -recode dgn dag dagsq (-9=.) xtset idperson swv +******************************************** +* H1a: Health status, in initial edu spell * +******************************************** + +* Process H1a: Probability of each self-rated health status for those who +* are in their initial education spell +* Sample: 16-29 year olds who are in their initial education spell +* DV: Categorical health status (5) -********************************** -*Process 1a: Those in education * -********************************** -* -*Self-rated health status for those in continuous education. 
-*sample: 16-29 year olds who have always been in education without a break fre dhe if (dag>=16 & dag<=29 & ded==1 ) -/* -regress dhe i.dgn dag dagsq li.ydses_c5 l.dhe ib8.drgn1 stm if scedsmpl==1 [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/health.xlsx", sheet("Process H1a - Self-rated Health") replace -putexcel A1 = matrix(results), names nformat(number_d2) -putexcel A1 = matrix(results), names nformat(number_d2) +/* Ordered probit models to replace linear regression +oprobit dhe i.dgn dag dagsq li.ydses_c5 ilb5.dhe ib8.drgn1 stm if (dag>=16 & dag<=29 & ded==1) [pweight=disclwt], vce(robust) */ -* Ordered probit models to replace linear regression -oprobit dhe i.dgn dag dagsq li.ydses_c5 ilb5.dhe ib8.drgn1 stm if (dag>=16 & dag<=29 & ded==1) [pweight=disclwt], vce(robust) +* Generalized ordered logit +gologit2 dhe i.Dgn Dag Dag_sq L_Ydses_c5 L_Dhe_pcs L_Dhe_mcs i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN Year_transformed Y2020 Y2021 i.Ethn_Asian i.Ethn_Black i.Ethn_Other /// + if dag >= 16 & dag <= 29 & ded == 1 & dhe_flag != 1 /// + [pweight = dimxwt], autofit +*Note: In gologit2, the coefficients show how covariates affect the log-odds of being above a certain category vs. at or below it. 
+ + + *raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/health", sheet("Process H1a") replace -putexcel A3 = matrix(results), names nformat(number_d2) +putexcel set "$dir_raw_results/health/health", sheet("Process H1a") replace +putexcel A3 = matrix(results), names //nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/H1a.doc", replace /// -title("Process H1a: Ordered probit regression estimates of self reported health status - individuals aged 16-29 in continuous education") /// +outreg2 stats(coef se pval) using "$dir_raw_results/health/H1a.doc", replace /// +title("Process H1a: Generalised ordered logit regression estimates of self reported health status - individuals aged 16-29 in initial education spell") /// ctitle(health status) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + +* Save sample inclusion indicator and predicted probabilities +gen in_sample = e(sample) +predict p1 p2 p3 p4 p5 + +* Save sample for later use (internal validation) +save "$dir_validation_data/H1a_sample", replace +* Store model summary statistics +scalar r2_p = e(r2_p) +scalar N_sample = e(N) + +* Store results in Excel -**************************************** -*Process 1b: Those in not in education * -**************************************** -* -*Self-rated health status for those not in continuous education (out of education or returned having left education in the past). 
-*sample: 16 or older who are not in continuous education -fre dhe if (dag>=16 & ded==0 ) -/* -regress dhe i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 l.dhe lib1.dhhtp_c4 ib8.drgn1 stm if scedsmpl==0 [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set health, sheet("Process H1b - Not in education") modify -putexcel A1 = matrix(results), names nformat(number_d2) -*/ +* Store estimates in matrices +matrix b = e(b) +matrix V = e(V) + +* Raw output +putexcel set "$dir_results/reg_health", sheet("H1a_raw") modify +putexcel A1 = matrix(b'), names //nformat(number_d2) +putexcel A1 = "CATEGORY" +putexcel B1 = "REGRESSOR" +putexcel C1 = "COEFFICIENT" + +* Estimated coefficients +scalar no_coefs_all = colsof(b) + +* Eliminate rows and columns containing zeros (baseline cats) +mata: + // Call matrices into mata + b = st_matrix("b") + + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + nonzero_b = select(b, keep) + + // Inspect + nonzero_b + + // Return to Stata + st_matrix("nonzero_b", nonzero_b) + st_matrix("nonzero_b_flag", keep) +end + +* Inspect +matrix list b +matrix list nonzero_b +matrix list nonzero_b_flag + +* Save dimensions +scalar no_nonzero_b = colsof(nonzero_b) +scalar no_nonzero_b_per = no_nonzero_b / 4 // number of categories-1 + +* Address repetition of proportional odds covariates + +* Generate repetition/unique observation flag +mata: + // Import matrices into mata + nonzero_b_mata = st_matrix("nonzero_b") + + // Generate binary vector =1 if coefficient repeated + n = cols(nonzero_b_mata) + repetition_flag = J(n, 1, 0) + + // use tolerance based comparison to avoid precision errors + tol = 1e-8 + + for (i = 1; i <= n; i++) { + for (j = 1; j <= n; j++) { + if (i != j && abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) { + repetition_flag[i] = 1 + break + } + } + } + repetition_flag + + // Generate binary vector =1 if coefficient not repeated + unique_flag = 1 :- 
repetition_flag + + // Return to Stata + st_matrix("repetition_flag", repetition_flag') + st_matrix("unique_flag", unique_flag') + +end -* Ordered probit models to replace linear regression +* Generate vector to multiply the coef vector with to eliminate the +* repetitions of coefficients for vars that satisfy the proportional odds assumptions +matrix structure_a = J(1,no_nonzero_b_per,1) +matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b] +matrix structure = structure_a, structure_b + +* Inspect +matrix list structure_a +matrix list structure_b +matrix list structure +matrix list nonzero_b + +* Eliminate repetitions +mata: + // Call matrices into mata + var = st_matrix("var") // NOTE(review): var is not used in this block and is only created further below - confirm it can be dropped + structure = st_matrix("structure") + nonzero_b = st_matrix("nonzero_b") + + // Convert repetitions into zeros + b_structure = structure :* nonzero_b + + b_structure + + // Eliminate zeros + keep = (b_structure :!= 0) + + nonzero_b_structure = select(b_structure, keep) + + // Export to Stata + st_matrix("b_structure", b_structure) + st_matrix("nonzero_b_structure", nonzero_b_structure) + +end + +matrix list nonzero_b_structure + +* Export into Excel +putexcel set "$dir_results/reg_health", sheet("UK_H1a") modify +putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) + + + +* Variance-covariance matrix +* Eliminate zeros (baseline categories) +mata: + V = st_matrix("V") + b = st_matrix("b") + + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + V_trimmed + + // Return to Stata + st_matrix("var", V_trimmed) +end + +matrix list var + +* Address repetition due to proportional odds being satisfied for some covars +matrix square_structure_a = J(no_nonzero_b,1,1) * structure +matrix square_structure_b = square_structure_a' + +matrix list square_structure_a +matrix list square_structure_b +mata: + // Call matrices into mata + var = st_matrix("var") + + // Create 
structure matrix (0 = eliminate) + square_structure_a = st_matrix("square_structure_a") + square_structure_b = st_matrix("square_structure_b") + + // Element-by-element multiplication + square_structure = square_structure_a :* square_structure_b + var_structure = square_structure :* var + + // Eliminate zeros + row_keep = rowsum(abs(var_structure)) :!= 0 + col_keep = colsum(abs(var_structure)) :!= 0 + + nonzero_var_structure = select(select(var_structure, row_keep), col_keep) + + // Return to Stata + st_matrix("nonzero_var_structure", nonzero_var_structure) +end + +matrix list nonzero_var_structure + +* Export to Excel +putexcel set "$dir_results/reg_health", sheet("UK_H1a") modify +putexcel C2 = matrix(nonzero_var_structure) + + +* Labels +putexcel set "$dir_results/reg_health", sheet("UK_H1a") modify + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +/* Create temporary frame ==> not available in stata 14 +frame create temp_frame +frame temp_frame: { + + mata: + // Import matrices from Stata + nonzero_b_flag = st_matrix("nonzero_b_flag")' + unique_flag = st_matrix("unique_flag")' + structure = st_matrix("structure")' + stripe = st_matrixcolstripe("e(b)") + + // Extract variable and category names + catnames = stripe[.,1] + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + catnames_no_bl = select(catnames, nonzero_b_flag :== 1) + + // Create and clean labels + // Address lags + labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl) + + // Add category + labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0)) + + // Remove 1. 
+ labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1) + + // Constant + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + + nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) + + // Add v1 + nonzero_labels_structure = "v1"\nonzero_labels_structure + + // Create temp file with results + fh = fopen("$dir_results/temp_labels.txt", "w") + for (i=1; i<=rows(nonzero_labels_structure); i++) { + fput(fh, nonzero_labels_structure[i]) + } + fclose(fh) + end + */ + * Here's a replacement for stata 14: +local dir_results "$dir_results" + +preserve +* Run Mata block +mata: + // Import matrices from Stata + nonzero_b_flag = st_matrix("nonzero_b_flag")' + unique_flag = st_matrix("unique_flag")' + structure = st_matrix("structure")' + stripe = st_matrixcolstripe("e(b)") + + // Extract variable and category names + catnames = stripe[.,1] + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + catnames_no_bl = select(catnames, nonzero_b_flag :== 1) + + // Handle lags + labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl) + + // Add category name when flag is not unique + labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0)) + + // Clean labels + labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1) + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + + // Filter for structure == 1 + nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1) + + // Add header row + nonzero_labels_structure = "v1"\nonzero_labels_structure + + // Write to temporary file + fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w") + for (i=1; i<=rows(nonzero_labels_structure); i++) { + fput(fh, nonzero_labels_structure[i]) + } + fclose(fh) +end + + * Import cleaned labels into Stata as new dataset + import delimited "$dir_results/temp_labels.txt", 
clear varnames(1) encoding(utf8) + gen n = _n + + * Export labels to Excel + putexcel set "$dir_results/reg_health", sheet("UK_H1a") modify + + * Vertical labels + sum n, meanonly + local N = r(max)+1 + + forvalue i = 2/`N' { + local j = `i' - 1 + putexcel A`i' = v1[`j'] + } + + * Horizontal labels + sum n, meanonly + local N = r(max) + 1 // Adjusted since we're working across columns + + forvalues j = 1/`N' { + local n = `j'+2 // Shift by 2 to start from column C + local col "" + + while `n' > 0 { + local rem = mod(`n' - 1, 26) + local col = char(65 + `rem') + "`col'" + local n = floor((`n' - 1)/26) + } + + putexcel `col'1 = v1[`j'] + } + + *Clean up + erase "$dir_results/temp_labels.txt" + + +* Export model fit statistics +putexcel set "$dir_results/reg_health", sheet("Gof") modify + +putexcel A3 = "H1a - Health status, in initial education spell", bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N_sample + +restore +* Clean up +drop in_sample p1 p2 p3 p4 p5 +scalar drop _all +matrix drop _all +//frame drop temp_frame + + +****************************************************** +* Process H1b: Health status, left intital edu spell * +****************************************************** + +* Process H1b: Probability of each self-rated health status for those who +* have left their initial education spell +* Sample: 16 or older who have left their initial education spell +* DV: Categorical health status (5) + +/* Ordered probit models to replace linear regression oprobit dhe i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ilb5.dhe lib1.dhhtp_c4 ib8.drgn1 stm if (dag>=16 & ded==0) [pweight=disclwt], vce(robust) +*/ + + * Generalized ordered logit +sort idperson swv + +gologit2 dhe i.Dgn Dag Dag_sq /// +i.Deh_c3_Medium i.Deh_c3_Low /// + i.L_Les_c3_Student i.L_Les_c3_NotEmployed /// + /*L_Ydses_c5*/ i.L_Ydses_c5_Q2 i.L_Ydses_c5_Q3 i.L_Ydses_c5_Q4 i.L_Ydses_c5_Q5 /// + L_Dhe_pcs L_Dhe_mcs /// + 
i.L_Dhhtp_c4_CoupleChildren i.L_Dhhtp_c4_SingleNoChildren i.L_Dhhtp_c4_SingleChildren /// + i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN /// + Year_transformed Y2020 Y2021 /// + i.Ethn_Asian i.Ethn_Black i.Ethn_Other /// + if dhe_flag != 1 & /// + dag >= 16 & ded == 0 [pweight = dimxwt], autofit +*Note: In gologit2, the coefficients show how covariates affect the log-odds of being above a certain category vs. at or below it. + + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/health", sheet("Process H1b") modify +putexcel set "$dir_raw_results/health/health", sheet("Process H1b") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/H1b.doc", replace /// -title("Process H1b: Ordered probit regression estimates of self reported health status - individuals aged 16+ not in continuous education") /// +outreg2 stats(coef se pval) using "$dir_raw_results/health/H1b.doc", replace /// +title("Process H1b: Generalised Ordered logit regression estimates of self reported health status - individuals aged 16+ not in continuous education") /// ctitle(health status) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + +* Save sample inclusion indicator and predicted probabilities +gen in_sample = e(sample) +predict p1 p2 p3 p4 p5 + +* Save sample for later use (internal validation) +save "$dir_validation_data/H1b_sample", replace + +* Store model summary statistics +scalar r2_p = e(r2_p) +scalar N_sample = e(N) + +* Store results in Excel + +* Store estimates in matrices +matrix b = e(b) +matrix V = e(V) + +* Raw output +putexcel set "$dir_results/reg_health", sheet("H1b_raw") modify +putexcel A1 = matrix(b'), names //nformat(number_d2) +putexcel A1 = "CATEGORY" +putexcel B1 = "REGRESSOR" +putexcel C1 = "COEFFICIENT" + +* Estimated coefficients +scalar no_coefs_all = colsof(b) + +* Eliminate 
rows and columns containing zeros (baseline cats) +mata: + // Call matrices into mata + b = st_matrix("b") + + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + nonzero_b = select(b, keep) + + // Inspect + nonzero_b + + // Return to Stata + st_matrix("nonzero_b", nonzero_b) + st_matrix("nonzero_b_flag", keep) +end + +* Inspect +matrix list b +matrix list nonzero_b +matrix list nonzero_b_flag + +* Save dimensions +scalar no_nonzero_b = colsof(nonzero_b) +scalar no_nonzero_b_per = no_nonzero_b / 4 // number of categories-1 + +* Address repetition of proportional odds covariates + +* Generate repetition/unique observation flag +mata: + // Import matrices into mata + nonzero_b_mata = st_matrix("nonzero_b") + + // Generate binary vector =1 if coefficient repeated + n = cols(nonzero_b_mata) + repetition_flag = J(n, 1, 0) + + // use tolerance based comparison to avoid precision errors + tol = 1e-8 + + for (i = 1; i <= n; i++) { + for (j = 1; j <= n; j++) { + if (i != j && abs(nonzero_b_mata[i] - nonzero_b_mata[j]) < tol) { + repetition_flag[i] = 1 + break + } + } + } + repetition_flag + + // Generate binary vector =1 if coefficient not repeated + unique_flag = 1 :- repetition_flag + + // Return to Stata + st_matrix("repetition_flag", repetition_flag') + st_matrix("unique_flag", unique_flag') + +end + +* Generate vector to multiply the coef vector with to eliminate the +* repetitions of coefficients for vars that satify the proportional odds assumptions +matrix structure_a = J(1,no_nonzero_b_per,1) +matrix structure_b = unique_flag[1,no_nonzero_b_per+1..no_nonzero_b] +matrix structure = structure_a, structure_b + +* Inspect +matrix list structure_a +matrix list structure_b +matrix list structure +matrix list nonzero_b + +* Eliminate repetitions +mata: + // Call matrices into mata + var = st_matrix("var") + structure = st_matrix("structure") + nonzero_b = st_matrix("nonzero_b") + + // Convert reptitions into zeros + b_structure = structure 
:* nonzero_b + + b_structure + + // Eliminate zeros + keep = (b_structure :!= 0) + + nonzero_b_structure = select(b_structure, keep) + + // Export to Stata + st_matrix("b_structure", b_structure) + st_matrix("nonzero_b_structure", nonzero_b_structure) + +end + +matrix list nonzero_b_structure + +* Export into Excel +putexcel set "$dir_results/reg_health", sheet("UK_H1b") modify +putexcel A1 = matrix(nonzero_b_structure'), names //nformat(number_d2) + + + +* Variance-covariance matrix +* ELiminate zeros (baseline categories) +mata: + V = st_matrix("V") + b = st_matrix("b") + + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + V_trimmed + + // Return to Stata + st_matrix("var", V_trimmed) +end + +matrix list var + +* Address repetition due to proportional odds being satisfied for some covars +matrix square_structure_a = J(no_nonzero_b,1,1) * structure +matrix square_structure_b = square_structure_a' + +matrix list square_structure_a +matrix list square_structure_b +mata: + // Call matrices into mata + var = st_matrix("var") + + // Create structure matrix (0 = eliminate) + square_structure_a = st_matrix("square_structure_a") + square_structure_b = st_matrix("square_structure_b") + + // Element-by-element multiplication + square_structure = square_structure_a :* square_structure_b + var_structure = square_structure :* var + + // Eliminate zeros + row_keep = rowsum(abs(var_structure)) :!= 0 + col_keep = colsum(abs(var_structure)) :!= 0 + + nonzero_var_structure = select(select(var_structure, row_keep), col_keep) + + // Return to Stata + st_matrix("nonzero_var_structure", nonzero_var_structure) +end + +matrix list nonzero_var_structure + +* Export to Excel +putexcel set "$dir_results/reg_health", sheet("UK_H1b") modify +putexcel C2 = matrix(nonzero_var_structure) + + +* Labels +putexcel set "$dir_results/reg_health", sheet("UK_H1b") modify + +putexcel A1 = "REGRESSOR" 
+putexcel B1 = "COEFFICIENT"
+
+/* Create temporary frame ==> not available in stata 14
+frame create temp_frame
+frame temp_frame: {
+
+    mata:
+        // Import matrices from Stata
+        nonzero_b_flag = st_matrix("nonzero_b_flag")'
+        unique_flag = st_matrix("unique_flag")'
+        structure = st_matrix("structure")'
+        stripe = st_matrixcolstripe("e(b)")
+
+        // Extract variable and category names
+        catnames = stripe[.,1]
+        varnames = stripe[.,2]
+        varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+        catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+
+        // Create and clean labels
+        // Address lags
+        labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+
+        // Add category
+        labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+
+        // Remove 1.
+        labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+
+        // Constant
+        labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+
+        nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+
+        // Add v1
+        nonzero_labels_structure = "v1"\nonzero_labels_structure
+
+        // Create temp file with results
+        fh = fopen("$dir_results/temp_labels.txt", "w")
+        for (i=1; i<=rows(nonzero_labels_structure); i++) {
+            fput(fh, nonzero_labels_structure[i])
+        }
+        fclose(fh)
+    end
+ */
+* Stata 14 replacement for the frame version above; the global is copied to a local so Mata can read the path via st_local()
+local dir_results "$dir_results"
+
+preserve
+* Build the regressor labels in Mata and write them, one per line, to a temporary text file
+mata:
+    // Import the flag/structure vectors (transposed to columns) and the column stripe of e(b)
+    nonzero_b_flag = st_matrix("nonzero_b_flag")'
+    unique_flag = st_matrix("unique_flag")'
+    structure = st_matrix("structure")'
+    stripe = st_matrixcolstripe("e(b)")
+
+    // Stripe column 1 = equation (category) name, column 2 = variable name; keep non-zero coefficients only
+    catnames = stripe[.,1]
+    varnames = stripe[.,2]
+    varnames_no_bl = select(varnames, nonzero_b_flag :== 1)
+    catnames_no_bl = select(catnames, nonzero_b_flag :== 1)
+
+    // Handle lags: a leading "L_" becomes a trailing "_L1"; other names pass through unchanged
+    labels_no_bl = regexm(varnames_no_bl, "^L_") :* (regexr(varnames_no_bl, "^L_", "") :+ "_L1") :+ (!regexm(varnames_no_bl, "^L_") :* varnames_no_bl)
+    // NOTE(review): "_" is appended to every label, so labels with unique_flag == 0 end in a bare "_" -- confirm this is intended
+    // Append the equation (category) name only where unique_flag != 0
+    labels_no_bl = labels_no_bl :+ "_" :+ (catnames_no_bl :* (unique_flag[1::rows(labels_no_bl)] :!= 0))
+
+    // Clean labels: drop the first "1." occurrence and rename the constant
+    labels_no_bl = usubinstr(labels_no_bl, "1.", "", 1)
+    labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant")
+
+    // Keep only the labels whose structure entry equals 1
+    nonzero_labels_structure = select(labels_no_bl, structure[1::rows(labels_no_bl)] :== 1)
+
+    // Add header row ("v1" becomes the variable name on import)
+    nonzero_labels_structure = "v1"\nonzero_labels_structure
+
+    // Write to temporary file
+    fh = fopen(st_local("dir_results") + "/temp_labels.txt", "w")
+    for (i=1; i<=rows(nonzero_labels_structure); i++) {
+        fput(fh, nonzero_labels_structure[i])
+    }
+    fclose(fh)
+end
+
+
+
+    * Import cleaned labels as a new dataset (replaces the data in memory; undone by -restore- below)
+    import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8)
+    gen n = _n
+
+    * Export labels to Excel
+    putexcel set "$dir_results/reg_health", sheet("UK_H1b") modify
+
+    * Vertical labels: label j goes to row j+1 of column A, hence the +1 on the upper bound
+    sum n, meanonly
+    local N = r(max)+1
+
+    forvalue i = 2/`N' {
+        local j = `i' - 1
+        putexcel A`i' = v1[`j']
+    }
+
+    * Horizontal labels: one header cell per label, starting at column C (above the block written at C2)
+    sum n, meanonly
+    local N = r(max) // was r(max) + 1, which addressed v1[_N+1] (out of range) on the last pass
+
+    forvalues j = 1/`N' {
+        local n = `j'+2 // Shift by 2 to start from column C
+        local col ""
+
+        while `n' > 0 {
+            local rem = mod(`n' - 1, 26)
+            local col = char(65 + `rem') + "`col'"
+            local n = floor((`n' - 1)/26)
+        }
+
+        putexcel `col'1 = v1[`j']
+    }
+
+    *Clean up
+    erase "$dir_results/temp_labels.txt"
+
+
+    * Export model fit statistics
+putexcel set "$dir_results/reg_health", sheet("Gof") modify
+
+putexcel A9 = "H1b - Health status, left initial education spell", bold
+
+putexcel A11 = "Pseudo R-squared"
+putexcel B11 = r2_p
+putexcel A12 = "N"
+putexcel B12 = N_sample
+
+restore
+* Clean up
+drop in_sample p1 p2 p3 p4 p5
+scalar drop _all
+matrix drop _all
+//frame drop temp_frame
-********************************************************************************************** -*Process 2b: Probability of being long-term sick or disabled amongst those not in education * -********************************************************************************************** -* -*Probability of becoming long-term sick or disabled for those not in continuous education. -*sample: 16 or older who are not in continuous education -fre dhe if (dag>=16 & ded==0 ) -probit dlltsd i.dgn dag dagsq ib1.deh_c3 li.ydses_c5 ib5.dhe ilb5.dhe l.dlltsd lib1.dhhtp_c4 ib8.drgn1 stm if (dag>=16 & ded==0 & dag<56) [pweight=disclwt], vce(robust) +*********************************************************** +* H2b: Long-term sick or disabled, left initial edu spell * +*********************************************************** + +* Process H2b: Probability of being long-term sick or disabled for those +* not in continuous education. +* Sample: 16 or older who have left their initial education spell +* DV: Long term sick/disabled dummy ==> plus those on disability benefits +tab2 dlltsd dlltsd01 + +fre dlltsd if (dag >= 16 & ded == 0) +fre dlltsd01 if (dag >= 16 & ded == 0) +fre les* if dlltsd01==1 +/*fre les* if dlltsd01==1 +les_c4 -- LABOUR MARKET: Activity status +--------------------------------------------------------------------------------- + | Freq. Percent Valid Cum. +------------------------------------+-------------------------------------------- +Valid 1 Employed or self-employed | 5549 11.46 11.47 11.47 + 2 Student | 646 1.33 1.34 12.81 + 3 Not employed | 24806 51.25 51.28 64.09 + 4 Retired | 17368 35.88 35.91 100.00 + Total | 48369 99.93 100.00 +Missing . 
| 32 0.07 +Total | 48401 100.00 +--------------------------------------------------------------------------------- +*/ + +/*probit dlltsd01 i.dgn dag dagsq ib1.deh_c3 li.ydses_c5 ib5.dhe ilb5.dhe l.dlltsd lib1.dhhtp_c4 ib8.drgn1 stm if (dag>=16 & ded==0) [pweight=disclwt], vce(robust) */ + +probit dlltsd01 i.Dgn Dag Dag_sq /// + i.Deh_c3_Medium i.Deh_c3_Low /// + li.Ydses_c5_Q2 li.Ydses_c5_Q3 li.Ydses_c5_Q4 li.Ydses_c5_Q5 /// + Dhe_pcs Dhe_mcs /// + L_Dhe_pcs L_Dhe_mcs /// + l.Dlltsd01 /// + li.Dhhtp_c4_CoupleChildren li.Dhhtp_c4_SingleNoChildren li.Dhhtp_c4_SingleChildren /// + i.UKC i.UKD i.UKE i.UKF i.UKG i.UKH i.UKJ i.UKK i.UKL i.UKM i.UKN /// + Year_transformed Y2020 Y2021 /// + i.Ethn_Asian i.Ethn_Black i.Ethn_Other /// +if (dag >= 16 & ded == 0) /// + [pweight = dimxwt], vce(robust) + + + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/health", sheet("Process H2b") modify +putexcel set "$dir_raw_results/health/health", sheet("Process H2b") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/H2b.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/health/H2b.doc", replace /// title("Process H2b: Probit regression estimates for being long-term sick or disabled - people aged 16+ not in continuous education") /// ctitle(long-term sick or disabled) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +gen in_sample = e(sample) +predict p +* Save sample for later use (internal validation) +save "$dir_validation_data/H2b_sample", replace + +* Store model summary statistics +scalar r2_p = e(r2_p) +scalar N_sample = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + +* Store results in Excel + +* Store estimates in matrices +matrix b = e(b) +matrix V = e(V) + +* Eliminate rows and columns containing zeros (baseline cats) +mata: + // Call matrices into mata + V = st_matrix("V") + b = 
st_matrix("b") + + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + b_trimmed = select(b, keep) + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + // Inspection + b_trimmed + V_trimmed + + // Return to Stata + st_matrix("b_trimmed", b_trimmed') + st_matrix("V_trimmed", V_trimmed) + st_matrix("nonzero_b_flag", keep) +end + +* Export into Excel +putexcel set "$dir_results/reg_health", sheet("UK_H2b") modify +putexcel B2 = matrix(b_trimmed) +putexcel C2 = matrix(V_trimmed) + + +* Labels +putexcel set "$dir_results/reg_health", sheet("UK_H2b") modify + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + + +/* Use frame and Mata to extract nice labels from colstripe of e(b) ==> not working in stata 14 +frame create temp_frame +frame temp_frame: { + + mata: + // Import matrices from Stata + nonzero_b_flag = st_matrix("nonzero_b_flag")' + stripe = st_matrixcolstripe("e(b)") + + // Extract and variable and category names + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + + // Create label vector + labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + labels_no_bl = regexm(labels_no_bl, "^L\.") :* (regexr(labels_no_bl, "^L\.", "") :+ "_L1") :+ (!regexm(labels_no_bl, "^L\.") :* labels_no_bl) + labels_no_bl = regexm(labels_no_bl, "^1L.") :* (regexr(labels_no_bl, "^1L.", "") :+ "_L1") :+ (!regexm(labels_no_bl, "1L.") :* labels_no_bl) + labels_no_bl = regexr(labels_no_bl, "_Dgn_L1$", "_Dgn") + + labels_no_bl + + nonzero_labels_structure = "v1"\labels_no_bl + + // Create temp file + fh = fopen("$dir_results/temp_labels.txt", "w") + for (i=1; i<=rows(nonzero_labels_structure); i++) { + fput(fh, nonzero_labels_structure[i]) + } + fclose(fh) + end +*/ +* STATA 14-COMPATIBLE LABEL EXTRACTION AND FILE EXPORT +* Mata: extract and clean labels +mata: + // Import matrices + nonzero_b_flag = st_matrix("nonzero_b_flag")' + stripe = 
st_matrixcolstripe("e(b)") + + // Extract varnames from stripe (2nd column) + varnames = stripe[.,2] + varnames_no_bl = select(varnames, nonzero_b_flag :== 1) + + // Clean label vector + labels_no_bl = usubinstr(varnames_no_bl, "1.", "", 1) + labels_no_bl = regexr(labels_no_bl, "^_cons", "Constant") + labels_no_bl = regexm(labels_no_bl, "^L\\.") :* (regexr(labels_no_bl, "^L\\.", "") :+ "_L1") :+ /// + (!regexm(labels_no_bl, "^L\\.") :* labels_no_bl) + labels_no_bl = regexm(labels_no_bl, "^1L\\.") :* (regexr(labels_no_bl, "^1L\\.", "") :+ "_L1") :+ /// + (!regexm(labels_no_bl, "^1L\\.") :* labels_no_bl) + labels_no_bl = regexr(labels_no_bl, "_Dgn_L1$", "_Dgn") + + // Save as macro for writing labels from Stata + st_local("nice_labels", invtokens(labels_no_bl')) +end + +* Save cleaned labels into your original file +capture file close labelout +file open labelout using "$dir_results/temp_labels.txt", write replace +file write labelout "v1" _n // header for import +foreach lbl in `nice_labels' { + file write labelout "`lbl'" _n +} +file close labelout + +* Import cleaned labels from your file +import delimited "$dir_results/temp_labels.txt", clear varnames(1) encoding(utf8) +gen n = _n + +* Export to Excel (vertical layout in column A) +putexcel set "$dir_results/reg_health", sheet("UK_H2b") modify +summarize n, meanonly +local N = r(max) + 1 +forvalue i = 2/`N' { + local j = `i' - 1 + putexcel A`i' = v1[`j'] +} + +* Export to Excel (horizontal layout in row 1, starting at column C) +forvalues j = 1/`N' { + local n = `j' + 2 // shift index: col C = 3 + local col "" + local nn = `n' + while `nn' > 0 { + local rem = mod(`nn' - 1, 26) + local col = char(65 + `rem') + "`col'" + local nn = floor((`nn' - 1)/26) + } + putexcel `col'1 = v1[`j'] +} + +* Clean up original file +erase "$dir_results/temp_labels.txt" + + +* Export model fit statistics +putexcel set "$dir_results/reg_health", sheet("Gof") modify + +putexcel A15 = "H2b - Long-term sick/disabled or on disability 
benefits, left initial edu spell", bold +putexcel A17 = "Pseudo R-squared" +putexcel B17 = r2_p +putexcel A18 = "N" +putexcel B18 = N_sample +putexcel E17 = "Chi^2" +putexcel F17 = chi2 +putexcel E18 = "Log likelihood" +putexcel F18 = ll + +* Clean up +//drop in_sample p +scalar drop _all +matrix drop _all +//frame drop temp_frame + capture log close + +cap erase "$dir_results/temp.dta" + diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do index 52df228ae..2e12f3e4e 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do @@ -3,7 +3,11 @@ * SECTION: Home ownership * OBJECT: Final Regresion Models - Weighted * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* LAST UPDATE: 15 May 2025 DP +* COUNTRY: UK +* +* NOTES: Removed spousal education to include singles, combined it with hh composition instead, added lagged home ownership as a predictor +* ******************************************************************************** clear all set more off @@ -12,90 +16,270 @@ set type double //set maxvar 120000 set maxvar 30000 +******************************************************************* +cap log close +log using "${dir_log}/reg_home_ownership.log", replace +******************************************************************* -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -* Directory which contains do files -global dir_do "${dir_work}/do" +do "$dir_do/variable_update" -* Directory which contains data files -global dir_data 
"${dir_work}/data" -* Directory which contains log files -global dir_log "${dir_work}/log" +*sample selection +drop if dag < 16 -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" +xtset idperson swv -******************************************************************* -cap log close -log using "${dir_log}/reg_home_ownership.log", replace -******************************************************************* -use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear +* Set Excel file -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" - -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" -label variable stm "Year" -label variable les_c3 "Employment Status: 5 Category" -label variable dhe "Self-rated Health" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable dhhtp_c4 "Household Type: 4 Category" - -label value dgn gdr -label value drgn1 rgna -label value les_c3 lessp_c3 jbf -label value deh_c3 dehsp_c3 edd -label value dcpen dcpex dlrtrd yn -label value dhhtp_c4 hht +* Info sheet -drop if dag < 16 -replace stm = stm - 2000 +putexcel set "$dir_results/reg_home_ownership", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing projection of home ownership" +putexcel A2 = "Authors: Patryk 
Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "HO1a" +putexcel B5 = "Probit regression estimates of the probability of being a home owner, aged 18+" -*check if all covariates are available and recode missing values -recode dhh_owned dgn dag dagsq les_c3 deh_c3 dhe yptciihs_dv ydses_c5 drgn1 dhhtp_c4 lessp_c3 stm (-9=.) +putexcel A10 = "Notes:", bold +putexcel B10 = "Have combined dhhtp_c4 and lessp_c3 into a single variable with 8 categories, dhhtp_c8" +putexcel B11 = "Added lagged home ownership, replaced dhe with dhe_pcs and dhe_mcs, added ethnicity (dot) and covid dummies (y2020 2021)" -xtset idperson swv +putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +************************ +* HO1a: Home ownership * +************************ + +* Process HO1a: Probability of being a home owner +* Sample: Individuals aged 18+ +* DV: Home ownerhip dummy + +fre dhh_owned if dag >= 18 +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 /// +i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if /// +dag>=18 [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(HO1a, dimlwt) side dec(4) -************************************************* -*Process HO1: Probability of being a homeowner. * -************************************************* -*Sample: Individuals aged 16 and above. 
+probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 /// +i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if /// +dag>=18 [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(HO1a, disclwt) side dec(4) -probit dhh_owned dgn dag dagsq il.dhhtp_c4 il.les_c3 il.lessp_c3 i.deh_c3 il.dhe il.ydses_c5 l.yptciihs_dv ib8.drgn1 stm if dag>=16 [pweight=disclwt], vce(cluster idperson) +probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 /// +i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if /// +dag>=18 [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(HO1a, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_HO1a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ + +probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 /// +i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if /// +dag>=18 [pweight=dimxwt], vce(cluster idperson) + + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/homeownership", sheet("Process HO1a") replace +putexcel set "$dir_raw_results/home_ownership/homeownership", sheet("Process HO1a") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1a.doc", replace /// -title("Process HO1a: Probability of being a home owner - people aged 16+") /// +outreg2 stats(coef se pval) using "$dir_raw_results/home_ownership/HO1a.doc", replace /// +title("Process HO1a: 
Probability of being a home owner - individuals aged 18+") /// ctitle(home owner) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/HO1a_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) +* Results +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/home_ownership/var_cov", sheet("var_cov") /// + replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/home_ownership/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_home_ownership", sheet("UK_HO1a") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+
+// Copy the non-zero entries of b, in their original order, into nonzero_b (no_vars comes from r(k) of -describe- above)
+local index = 1
+forvalues i = 1/`no_vars' {
+    if (b[1, `i'] != 0) {
+        matrix nonzero_b[1, `index'] = b[1, `i']
+        local index = `index' + 1
+    }
+}
+
+putexcel set "$dir_results/reg_home_ownership", sheet("UK_HO1a") modify
+putexcel A1 = matrix(nonzero_b'), names nformat(number_d2)
+* The names written by the line above (column A and cell B1) are overwritten by the hard-coded labels below.
+* Labels are positional: 40 labels for rows 2-41 -- TODO confirm the probit keeps exactly these 40 non-zero coefficients in this order.
+* Labelling
+* NOTE(review): the probit uses l.dhe_mcs and l.dhe_pcs (lags), yet they are labelled without the _L1 suffix -- confirm the names the simulator expects.
+putexcel A1 = "REGRESSOR"
+putexcel A2 = "Dgn"
+putexcel A3 = "Dag"
+putexcel A4 = "Dag_sq"
+putexcel A5 = "Dhhtp_c8_2_L1"
+putexcel A6 = "Dhhtp_c8_3_L1"
+putexcel A7 = "Dhhtp_c8_4_L1"
+putexcel A8 = "Dhhtp_c8_5_L1"
+putexcel A9 = "Dhhtp_c8_6_L1"
+putexcel A10 = "Dhhtp_c8_7_L1"
+putexcel A11 = "Dhhtp_c8_8_L1"
+putexcel A12 = "Les_c3_Student_L1"
+putexcel A13 = "Les_c3_NotEmployed_L1"
+putexcel A14 = "Deh_c3_Medium"
+putexcel A15 = "Deh_c3_Low"
+putexcel A16 = "Dhe_mcs"
+putexcel A17 = "Dhe_pcs"
+putexcel A18 = "Ydses_c5_Q2_L1"
+putexcel A19 = "Ydses_c5_Q3_L1"
+putexcel A20 = "Ydses_c5_Q4_L1"
+putexcel A21 = "Ydses_c5_Q5_L1"
+putexcel A22 = "Yptciihs_dv_L1"
+putexcel A23 = "Dhh_owned_L1"
+putexcel A24 = "UKC"
+putexcel A25 = "UKD"
+putexcel A26 = "UKE"
+putexcel A27 = "UKF"
+putexcel A28 = "UKG"
+putexcel A29 = "UKH"
+putexcel A30 = "UKJ"
+putexcel A31 = "UKK"
+putexcel A32 = "UKL"
+putexcel A33 = "UKM"
+putexcel A34 = "UKN"
+putexcel A35 = "Year_transformed"
+putexcel A36 = "Y2020"
+putexcel A37 = "Y2021"
+putexcel A38 = "Ethn_Asian"
+putexcel A39 = "Ethn_Black"
+putexcel A40 = "Ethn_Other"
+putexcel A41 = "Constant"
+* Header row: B1 must read "COEFFICIENT", as on the other regression sheets (was misspelled "COFFICIENT")
+putexcel B1 = "COEFFICIENT"
+putexcel C1 = "Dgn"
+putexcel D1 = "Dag"
+putexcel E1 = "Dag_sq"
+putexcel F1 = "Dhhtp_c8_2_L1"
+putexcel G1 = "Dhhtp_c8_3_L1"
+putexcel H1 = "Dhhtp_c8_4_L1"
+putexcel I1 = "Dhhtp_c8_5_L1"
+putexcel J1 = "Dhhtp_c8_6_L1"
+putexcel K1 = "Dhhtp_c8_7_L1"
+putexcel L1 = "Dhhtp_c8_8_L1"
+putexcel M1 = "Les_c3_Student_L1"
+putexcel N1 = "Les_c3_NotEmployed_L1"
+putexcel O1 = "Deh_c3_Medium"
+putexcel P1 = "Deh_c3_Low"
+putexcel Q1 = "Dhe_mcs"
+putexcel R1 = "Dhe_pcs"
+putexcel S1 = "Ydses_c5_Q2_L1" +putexcel T1 = "Ydses_c5_Q3_L1" +putexcel U1 = "Ydses_c5_Q4_L1" +putexcel V1 = "Ydses_c5_Q5_L1" +putexcel W1 = "Yptciihs_dv_L1" +putexcel X1 = "Dhh_owned_L1" +putexcel Y1 = "UKC" +putexcel Z1 = "UKD" +putexcel AA1 = "UKE" +putexcel AB1 = "UKF" +putexcel AC1 = "UKG" +putexcel AD1 = "UKH" +putexcel AE1 = "UKJ" +putexcel AF1 = "UKK" +putexcel AG1 = "UKL" +putexcel AH1 = "UKM" +putexcel AI1 = "UKN" +putexcel AJ1 = "Year_transformed" +putexcel AK1 = "Y2020" +putexcel AL1 = "Y2021" +putexcel AM1 = "Ethn_Asian" +putexcel AN1 = "Ethn_Black" +putexcel AO1 = "Ethn_Other" +putexcel AP1 = "Constant" + + +* Goodness of fit + +putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify + +putexcel A3 = "HO1a - Home ownership", bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N +putexcel E5 = "Chi^2" +putexcel F5 = chi2 +putexcel E6 = "Log likelihood" +putexcel F6 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + capture log close + diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_income.do b/input/InitialPopulations/compile/RegressionEstimates/reg_income.do index 138e19009..b86f43be1 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_income.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_income.do @@ -1,9 +1,23 @@ ******************************************************************************** * PROJECT: ESPON * SECTION: Non-employment/non-benefit income -* OBJECT: Final Regresion Models - Weighted -* AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* OBJECT: Final Regresion Models +* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven +* LAST UPDATE: 3 July 2025 DP +* COUNTRY: UK + +* NOTES: Models for split income variable +* The goal is to split the current non-labour non-benefit income variable into 3 components +* (capital returns, occupational pension, public pension) and estimate each of them 
separately, +* using (if possible) current set of controls. We have decided to abstain from estimating transfers at the moment. +* +* The income do file must be run after +* the wage estimates are obtain because they use +* predicted wages. +/******************************************************************************* + + +*******************************************************************************/ ******************************************************************************** clear all set more off @@ -17,7 +31,8 @@ set maxvar 30000 * DEFINE DIRECTORIES *******************************************************************************/ * Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +//global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" +global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates" * Directory which contains do files global dir_do "${dir_work}/do" @@ -29,16 +44,15 @@ global dir_data "${dir_work}/data" global dir_log "${dir_work}/log" * Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - +//global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" +global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data" ******************************************************************* cap log close log using "${dir_log}/reg_income.log", replace ******************************************************************* - -import excel "$dir_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real growth index +import excel "$dir_external_data/time_series_factor.xlsx", sheet("UK_gdp") firstrow clear // Import real growth index rename Year stm rename Value growth gen base_val = growth if stm == 2015 @@ -47,370 +61,1103 @@ replace base_val = r(mean) replace growth= growth/base_val drop base_val replace stm = stm - 
2000 -save "$dir_data\growth_rates", replace - -use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear - -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" - -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" -label variable stm "Year" -label variable les_c3 "Employment Status: 3 Category" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable dhhtp_c4 "Household Type: 4 Category" -label variable dnc "Number of Children in Household" -label variable dnc02 "Number of Children aged 0-2 in Household" -label variable dhe "Self-rated Health" -label variable ydses_c5 "Annual Household Income Quintile" -label variable dlltsd "Long-term Sick or Disabled" -label variable dcpen "Entered a new Partnership" -label variable dcpex "Partnership dissolution" -label variable lesdf_c4 "Differntial Employment Status" -label variable ypnbihs_dv "Personal Non-benefit Gross Income" - -gen ypnbihs_dv_sq =ypnbihs_dv^2 - -label variable ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared" -label variable ynbcpdf_dv "Differential Personal Non-Benefit Gross Income" - -label value dgn gdr -label value drgn1 rgna -label value les_c3 jbf -label value deh_c3 edd -label value dcpen dcpex yn -label value lesdf_c4 dces -label value ded dlltsd yn -label value dhhtp_c4 hht +save "$dir_external_data\growth_rates", replace + +use 
"$dir_ukhls_data/ukhls_pooled_all_obs_10.dta", clear //note this is a pooled dataset after Heckman has been estimated -drop if dag < 16 -//replace stm = stm - 2000 sort stm -merge m:1 stm using "$dir_data/growth_rates", keep(3) nogen keepusing(growth) +merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen keepusing(growth) +do "$dir_do/variable_update" -/********************************************************************** -CLEAN UP VARIABLES FOR REGRESSIONS -***********************************************************************/ -recode dgn dag dagsq dhe drgn1 stm scedsmpl deh_c3 les_c3 dhhtp_c4 dhe (-9=.) -sum yplgrs_dv ypncp ypnoab pred_hourly_wage +*sample selection +drop if dag < 16 xtset idperson swv -/* -***************************************************************** -*Process I1a: Non-employment income - In continuous education * -***************************************************************** -regress yptciihs_dv i.dgn dag dagsq l.dhe l.yptciihs_dv ib8.drgn1 stm if scedsmpl==1 [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/Income_mdls", sheet("Income - In education") replace -putexcel A1 = matrix(results), names nformat(number_d2) -*predict fittedice -*histogram fittedice -*histogram yptciihs_dv - -*Getting Variance Covariance Matrix -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/income_vcm", sheet("Process I1a - In education") replace -putexcel A1 = matrix(i1a), names -******************************************************************* -*Process I1b: Non-employment income - Not in continuous education * -******************************************************************* -regress yptciihs_dv i.dgn dag dagsq ib1.deh_c3 i.dlrtrd li.les_c3 lib1.dhhtp_c4 l.dhe l.yplgrs_dv l.yptciihs_dv /// -l2.yplgrs_dv l2.yptciihs_dv l3.yplgrs_dv l3.yptciihs_dv ib8.drgn1 stm if scedsmpl==0 [pweight=disclwt], vce(robust) -matrix results = r(table) -matrix results = 
results[1..6,1...]' -putexcel set "$dir_data/Income_mdls", sheet("Income - Not in education") modify -putexcel A1 = matrix(results), names nformat(number_d2) -*predict fittednice -*histogram fittednice -*histogram yptciihs_dv - -*Getting Variance Covariance Matrix -matrix i1b=get(VCE) -matrix list i1b -putexcel set "$dir_data/income_vcm", sheet("Process I1b - Not in education") modify -putexcel A1 = matrix(i1b), names -*/ - - -/******************************************************************************* - -New models for split income variable -The goal is to split the current non-labour non-benefit income variable into 3 components -(capital returns, occupational pension, public pension) and estimate each of them separately, -using (if possible) current set of controls. We have decided to abstain from estimating transfers at the moment. - -*******************************************************************************/ -bys swv idhh: gen nwa = _N -*Replace l.dhe with dhe if aged 16 -gsort +idperson -stm -bys idperson: carryforward dhe if dag <= 16, replace - -//For those who are 16, L1 of the variables below is missing as they were 15 at the time. Use current value to keep them in the sample. 
-sort idperson swv -bys idperson: gen dhe_L1 = l.dhe -replace dhe_L1 = dhe if missing(dhe_L1) //For those who have L1.dhe missing, use current dhe - -bys idperson: gen yplgrs_L1 = l.yplgrs_dv -replace yplgrs_L1 = yplgrs_dv if missing(yplgrs_L1) - -bys idperson: gen ypncp_L1 = l.ypncp -replace ypncp_L1 = ypncp if missing(ypncp_L1) - -bys idperson: gen yplgrs_L2 = l2.yplgrs_dv -replace yplgrs_L2 = yplgrs_dv if missing(yplgrs_L2) - -bys idperson: gen ypncp_L2 = l2.ypncp -replace ypncp_L2 = ypncp if missing(ypncp_L2) - -bys idperson: gen dhhtp_c4_L1 = l.dhhtp_c4 -replace dhhtp_c4_L1 = dhhtp_c4 if missing(dhhtp_c4_L1) - -bys idperson: gen les_c3_L1 = l.les_c3 -replace les_c3_L1 = les_c3 if missing(les_c3_L1) +* Set Excel file + +* Info sheet +putexcel set "$dir_results/reg_income", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "This file contains regression estiamtes used by processes I3 (capital income), I4 (private pension, retired last year), I5 (private pension income, not retired last year) " +putexcel A2 = "Authors: Patryk Bronka, Justin Van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "Process I3a selection" +putexcel B5 = "Logit regression estimates of the probability of receiving capital income - aged 16+ in initial education spell" +putexcel A6 = "Process I3b selection" +putexcel B6 = "Logit regression estimates of the probability of receiving capital income - aged 16+ not in initial education spell" +putexcel A7 = "Process I3a amount" +putexcel B7 = "OLS regression estimates (log) capital income amount - aged 16+ in initial education spell and receive capital income" +putexcel A8 = "Process I3b amount" +putexcel B8 = "OLS regression estimates (log) capital income amount - not in initial education spell and receive capital income" +putexcel A9 = "Process I4b amount" +putexcel B9 = "OLS regression estimates (log) private pension income - 
aged 50+ and were retired last year, receive private pension income" +putexcel A10 = "Process I5a selection" +putexcel B10 = "Logit regression estimates of the probability of receiving private pension income - aged 50+ and not a student or retired last year" +putexcel A11 = "Process I5a amount" +putexcel B11 = "OLS regression estimates (log) private pension income - aged 50+ and not a student or retired last year" + + +putexcel A15 = "Notes:", bold +putexcel B15 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot) and Covid dummies (y2020 y2021)" +putexcel B16 = "All processes: reverted to using stm instead of GDP growth" +putexcel B17 = "All processes for amounts: moved to log transformation" /********************************************************************** -SELECTION MODELS FOR CAPITAL INCOME +CAPITAL INCOME ***********************************************************************/ ***************************************************************** -*Process I3a selection: Probability of receiving capital income. +*I3a selection: Probability of receiving capital income, in initial edu spell ***************************************************************** -*Sample: Individuals aged 16 - 29 who are in continuous education. 
-gen receives_ypncp = (ypncp > 0 & !missing(ypncp)) -logit receives_ypncp i.dgn dag dagsq l.dhe l.yplgrs_dv l.ypncp ib8.drgn1 stm if scedsmpl==1 [pweight=dimxwt], vce(cluster idperson) base +* Sample: All individuals 16+ that are in initial edu spell +* DV: Receiving capital income dummy +* Note: Capital income and employment income variables in IHS version + +logit receives_ypncp i.dgn dag dagsq /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 yplgrs_dv_L1 ypncp_L1 ib8.drgn1 stm y2020 y2021 i.dot /// + if ded == 1 & dag >= 16 [pweight=dimxwt], /// + vce(cluster idperson) base +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I3a_selection E") replace +putexcel set "$dir_raw_results/income/income_split", sheet("Process I3a_selection E") replace putexcel A1 = matrix(results), names nformat(number_d2) - -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3a_selection VCE") replace -putexcel A1 = matrix(i1a), names - -outreg2 stats(coef se pval) using "$dir_data/I3a_sel.doc", replace /// -title("Process I3a selection: Probability of receiving capital income. Sample: Individuals aged 16 - 29 who are in continuous education.") /// +matrix i3a=get(VCE) +matrix list i3a +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3a_selection VCE") replace +putexcel A1 = matrix(i3a), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I3a_sel.doc", replace /// +title("Process I3a selection: Probability of receiving capital income. Sample: Individuals aged 16+ who are in initial education spell.") /// ctitle(Probability of capital income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) - -******************************************************************** -*Process I3b selection: Probability of receiving capital income. 
-******************************************************************** -*Sample: Individuals aged 16+ who are not in continuous education. - -logit receives_ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c3 lib1.dhhtp_c4 l.dhe l.yplgrs_dv l.ypncp l2.yplgrs_dv /// -l2.ypncp ib8.drgn1 stm if scedsmpl==0 [pweight=dimxwt], vce(cluster idperson) base - +cap drop in_sample +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/I3a_selection_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + +* Results +* Note: Zero values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I3a_selection") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I3a_selection") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling +// Update these labels whenever a variable is added to the model. Order matters. 
+local var_list Dgn Dag Dag_sq Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A3 = /// + "I3a selection - Receiving capital income in initial education spell ", /// + bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N +putexcel E5 = "Chi^2" +putexcel F5 = chi2 +putexcel E6 = "Log likelihood" +putexcel F6 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + +********************************************************************* +* I3b selection: Probability of receiving capital income, not in initial edu spell * +********************************************************************* +* Sample: All individuals 16+, not in initial edu spell +* DV: Receiving capital income dummy +* Note: Capital income and employment income variables in IHS version + +logit receives_ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 /// +yplgrs_dv_L1 ypncp_L1 yplgrs_dv_L2 ypncp_L2 ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot /// + if ded == 0 [pweight=dimxwt], /// + vce(cluster idperson) base + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I3b_selection E") modify +putexcel set 
"$dir_raw_results/income/income_split", sheet("Process I3b_selection E") modify putexcel A1 = matrix(results), names nformat(number_d2) - -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3b_selection VCE") modify -putexcel A1 = matrix(i1a), names - -outreg2 stats(coef se pval) using "$dir_data/I3b_sel.doc", replace /// -title("Process I3b selection: Probability of receiving capital income. Sample: Individuals aged 16+ who are not in continuous education.") /// +matrix i3b=get(VCE) +matrix list i3b +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3b_selection VCE") modify +putexcel A1 = matrix(i3b), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I3b_sel.doc", replace /// +title("Process I3b selection: Probability of receiving capital income. Sample: Individuals aged 16+ who are not in initial education spell.") /// ctitle(Probability of capital income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) -/**********************************************************************/ -******************************************** -*Process I3a: Amount of capital income. -******************************************** -*Sample: Individuals aged 16 - 29 who are in continuous education and receive capital income. 
-*Using same controls as Cara - use of lags means those observed for the first time are not taken into account - -regress ypncp i.dgn dag dagsq l.dhe l.yplgrs_dv l.ypncp ib8.drgn1 stm if scedsmpl==1 & receives_ypncp == 1 [pweight=dimxwt], /// -vce(cluster idperson) base +//cap drop in_sample +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/I3b_selection_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + +* Results +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I3b_selection") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I3b_selection") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling +// Need to variable label when add new variable to model. Order matters. 
+ +local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low Les_c4_Student_L1 /// + Les_c4_NotEmployed_L1 Les_c4_Retired_L1 Dhhtp_c4_CoupleChildren_L1 /// + Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 /// + Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 Yplgrs_dv_L2 Ypncp_L2 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A9 = /// + "I3b selection - Receiving capital income left initial education spell ", /// + bold + +putexcel A11 = "Pseudo R-squared" +putexcel B11 = r2_p +putexcel A12 = "N" +putexcel B12 = N +putexcel E11 = "Chi^2" +putexcel F11 = chi2 +putexcel E12 = "Log likelihood" +putexcel F12 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + +******************************************************* +* I3a: Amount of capital income, in initial edu spell * +******************************************************* +* Sample: All individuals 16+ that received capital income, in initial education spell +* DV: Log of capital income (ln_ypncp) + +regress ln_ypncp i.dgn dag dagsq /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 yplgrs_dv_L1 ypncp_L1 /// +ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot if dag >= 16 & receives_ypncp == 1 & ded == 1 /// + [pweight = dimxwt], vce(cluster idperson) + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set 
"$dir_data/uk_income_split", sheet("Process I3a CapIn E") modify +putexcel set "$dir_raw_results/income/income_split", sheet("Process I3a_amount E") modify putexcel A1 = matrix(results), names nformat(number_d2) - -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3a CapIn E VCE") modify -putexcel A1 = matrix(i1a), names - -outreg2 stats(coef se pval) using "$dir_data/I3a.doc", replace /// -title("Process I3a: Amount of capital income. Sample: Individuals aged 16 - 29 who are in continuous education and receive capital income.") /// - ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) - -******************************************* -*Process I3b: Amount of capital income. -******************************************* -*Sample: Individuals aged 16+ who are not in continuous education and receive capital income. -*Using same controls as Cara -regress ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c3 lib1.dhhtp_c4 l.dhe l.yplgrs_dv l.ypncp l2.yplgrs_dv l2.ypncp ib8.drgn1 stm /// - if scedsmpl==0 & receives_ypncp == 1 [pweight=dimxwt], vce(cluster idperson) base +matrix i3a=get(VCE) +matrix list i3a +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3a_amount VCE") modify +putexcel A1 = matrix(i3a), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I3a.doc", replace /// +title("Process I3a: Amount of capital income. 
Sample: Individuals aged 16+ who are in initial education spell abd receive capital income.") /// + ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) + + +* Save sample inclusion indicator and predicted probabilities +gen in_sample = e(sample) +predict p +gen sigma = e(rmse) + +save "$dir_validation_data/I3a_level_sample", replace + +scalar r2 = e(r2) +scalar N = e(N) +scalar rmse= e(rmse) + +* Results +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I3a_amount") modify +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I3a_amount") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling +// Need to variable label when add new variable to model. Order matters. 
+local var_list Dgn Dag Dag_sq Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 /// +UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// +Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A6 = ("I3a") B6 = rmse + + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A15 = /// + "I3a level - Receiving capital income in initial education spell ", /// + bold + +putexcel A17 = "R-squared" +putexcel B17 = r2 +putexcel A18 = "N" +putexcel B18 = N + +drop in_sample p sigma +scalar drop r2 N + + +*********************************************************** +* I3b: Amount of capital income, not in initial edu spell * +*********************************************************** +* Sample: Individuals aged 16+ who are not in their initial education spell and +* receive capital income. 
+ +regress ln_ypncp i.dgn dag dagsq ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 /// + yplgrs_dv_L1 ypncp_L1 yplgrs_dv_L2 ypncp_L2 ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot /// + if ded == 0 & receives_ypncp == 1 [pweight = dimxwt], /// + vce(cluster idperson) + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I3b CapIn NiE") modify +putexcel set "$dir_raw_results/income/income_split", sheet("Process I3b_amount E") modify putexcel A1 = matrix(results), names nformat(number_d2) +matrix i3b=get(VCE) +matrix list i3b +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I3b_amount VCE") modify +putexcel A1 = matrix(i3b), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I3b.doc", replace /// +title("Process I3b: Amount of capital income. Sample: Individuals aged 16+ who are not in initial education spell and receive capital income.") /// + ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) + + +* Save sample inclusion indicator and linear predictions +gen in_sample = e(sample) +predict p +gen sigma = e(rmse) + +save "$dir_validation_data/I3b_level_sample", replace + +scalar r2 = e(r2) +scalar N = e(N) +scalar rmse= e(rmse) + +* Results +* Note: Zero values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I3b_amount") modify +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +// Initialize a counter for non-zero 
coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I3b_amount") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling +// Need to variable label when add new variable to model. Order matters. +local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low Les_c4_Student_L1 /// + Les_c4_NotEmployed_L1 Les_c4_Retired_L1 Dhhtp_c4_CoupleChildren_L1 /// + Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 /// + Dhe_pcs_L1 Dhe_mcs_L1 Yplgrs_dv_L1 Ypncp_L1 Yplgrs_dv_L2 Ypncp_L2 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* Save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A7 = ("I3b") B7 = rmse + + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A21 = /// + "I3b level - Receiving capital income left initial 
education spell ", /// + bold + +putexcel A23 = "R-squared" +putexcel B23 = r2 +putexcel A24 = "N" +putexcel B24 = N + +drop in_sample p sigma +scalar drop r2 N -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I3b CapIn NiE VCE") modify -putexcel A1 = matrix(i1a), names - -outreg2 stats(coef se pval) using "$dir_data/I3b.doc", replace /// -title("Process I3b: Amount of capital income. Sample: Individuals aged 16+ who are not in continuous education and receive capital income.") /// -ctitle(Amount of capital income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) - -replace les_c3 = 4 if dlrtrd == 1 - -label define jbf 4 "Retired", add /********************************************************************** PRIVATE PENSION INCOME ***********************************************************************/ + *************************************************** -*Process I4b: Amount of pension income. +*I4b: Amount of pension income. *************************************************** *Sample: Retired individuals who were retired in the previous year. 
-gen state_pension_age = (dag >= 68) -gen receives_ypnoab = (ypnoab_lvl > 0 & !missing(ypnoab_lvl)) -regress ypnoab dag dagsq ib1.deh_c3 lib1.dhhtp_c4 l.dhe l.ypnoab l2.ypnoab ib8.drgn1 c.growth stm /// -if dag >= 50 & les_c3 == 4 & l.les_c3 == 4 [pweight=dimxwt], vce(cluster idperson) base +regress ln_ypnoab i.dgn dag dagsq ib1.deh_c3 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 /// +ypnoab_L1 ypnoab_L2 ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot /// +if dag >= 50 & receives_ypnoab & dlrtrd==1 & l.dlrtrd==1 [pweight=dimxwt], /// +vce(cluster idperson) base + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I4b Pension Next") modify +putexcel set "$dir_raw_results/income/income_split", sheet("Process I4b_amount E") replace putexcel A1 = matrix(results), names nformat(number_d2) - -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I4b Pension Next VCE") modify -putexcel A1 = matrix(i1a), names - -outreg2 stats(coef se pval) using "$dir_data/14b.doc", /// -replace title("Process I4b: Amount of pension income. Sample: Retired individuals who were retired in the previous year.") /// -ctitle(Retired) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) +matrix i4b=get(VCE) +matrix list i4b +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I4b_amount VCE") replace +putexcel A1 = matrix(i4b), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I4b.doc", replace /// +title("Process I4b: Amount of private pension income. 
Sample: Individuals aged 50+ who were retired in the previous year and receive private pension income.") /// + ctitle(Amount of private pension income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) + + +* Save sample inclusion indicator and predicted probabilities +gen in_sample = e(sample) +predict p +gen sigma = e(rmse) + +save "$dir_validation_data/I4b_level_sample", replace + +scalar r2 = e(r2) +scalar N = e(N) +scalar rmse= e(rmse) + +* Results +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I4b_amount") modify +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I4b_amount") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling +// Need to variable label when add new variable to model. Order matters. 
+local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low /// + Dhhtp_c4_CoupleChildren_L1 Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 /// + Dhe_pcs_L1 Dhe_mcs_L1 Ypnoab_L1 Ypnoab_L2 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* Save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A8 = ("I4b") B8 = rmse + + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A26 = /// + "I4b level - Receiving private pension income: was retired last year", /// + bold + +putexcel A27 = "R-squared" +putexcel B27 = r2 +putexcel A28 = "N" +putexcel B28 = N + +drop in_sample p sigma +scalar drop r2 N - -/********************************************************************** -PRIVATE PENSION INCOME VERSION 2: - -selection equation for recipiency of private pension income - -followed by level of private pension income using linear model -***********************************************************************/ ************************************************************************** -*Process I5a: Probability of receiving private pension income. +*I5a: Probability of receiving private pension income. ************************************************************************** *Sample: Retired individuals who were not retired in the previous year. 
+* DV: Receiving private pension income dummy /* Estimated on a sample of individuals retired at time t, who were not retired at t-1. I.e. this is probability of receiving private pension income upon retirement. */ -logit receives_ypnoab i.dgn i.state_pension_age ib1.deh_c3 lib4.les_c3 lib1.dhhtp_c4 l.dhe l.pred_hourly_wage ib8.drgn1 c.growth stm /// -if scedsmpl==0 & dag >= 50 & dlrtrd == 1 & l.les_c3 != 2 & l.les_c3 != 4 [pweight=dimxwt], vce(cluster idperson) base - +logit receives_ypnoab i.dgn i.state_pension_age ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 /// +l.pred_hourly_wage ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot /// +if dag >= 50 & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2 [pweight=dimxwt], /// +vce(cluster idperson) base + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I5a Select") modify +putexcel set "$dir_raw_results/income/income_split", sheet("Process I5a_selection E") replace putexcel A1 = matrix(results), names nformat(number_d2) +matrix i5a=get(VCE) +matrix list i5a +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I5a_selection VCE") replace +putexcel A1 = matrix(i5a), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I5a_sel.doc", replace /// +title("Process I5a selection: Probability of receiving private pension income. 
Sample: Individuals aged 50+ who were not retired last year.") /// +ctitle(Probability of receiving private pension income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + +//cap drop in_sample +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/I5a_selection_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + +* Results +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I5a_selection") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I5a_selection") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling +// Need to variable label when add new variable to model. Order matters. 
+ +local var_list Dgn StatePensionAge Deh_c3_Medium Deh_c3_Low /// + Les_c4_NotEmployed_L1 /// + Dhhtp_c4_CoupleChildren_L1 Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 /// + Dhe_pcs_L1 Dhe_mcs_L1 Hourly_wage_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A30 = /// + "I5a selection - Receiving private pension income: was not retited last year", /// + bold + +putexcel A32 = "Pseudo R-squared" +putexcel B32 = r2_p +putexcel A33 = "N" +putexcel B33 = N +putexcel E32 = "Chi^2" +putexcel F32 = chi2 +putexcel E33 = "Log likelihood" +putexcel F33 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I5a Select") modify -putexcel A1 = matrix(i1a), names -outreg2 stats(coef se pval) using "$dir_data/I5a.doc", replace /// -title("Process I5a: Probability of receiving private pension income. Sample: Retired individuals who were not retired in the previous year.") /// -ctitle(Probability of private pension income) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) **************************************************** -*Process I5b: Amount of private pension income. +*I5a: Amount of private pension income. 
**************************************************** *Sample: Retired individuals who were not retired in the previous year and receive private pension income. -regress ypnoab_lvl i.dgn i.state_pension_age ib1.deh_c3 lib4.les_c3 lib1.dhhtp_c4 l.dhe l.pred_hourly_wage ib8.drgn1 c.growth stm /// -if scedsmpl==0 & dag >= 50 & dlrtrd == 1 & l.les_c3 != 2 & l.les_c3 != 4 & receives_ypnoab [pweight=dimxwt], vce(cluster idperson) base - -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I5b Amount") modify -putexcel A1 = matrix(results), names nformat(number_d2) - -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I5b Amount") modify -putexcel A1 = matrix(i1a), names -outreg2 stats(coef se pval) using "$dir_data/I5b.doc", replace /// -title("Process I5b: Amount of private pension income. Sample: Retired individuals who were not retired in the previous year and receive private pension income.") /// -ctitle(Amount of private pension income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) - -capture log close - -/* -******************** -*I6a: selection -******************** -/* - -Processes I6a and I6b are used to estimate private pension income among those continue retirement (retired at t and at t-1), -*and have not received private pension income in the previous year - -Estimated on a sample of individuals retired at time t -I.e. 
this is probability of receiving private pension in retirement, if not received private pension income in the initial population data -*/ - -logit receives_ypnoab i.dgn i.state_pension_age ib1.deh_c3 lib4.les_c3 lib1.dhhtp_c4 cl.ypncp l.dhe ib8.drgn1 c.growth stm /// -if dag >= 50 & les_c3 == 4 & l.les_c3 == 4 & l.receives_ypnoab == 0 [pweight=dimxwt], vce(cluster idperson) base - -matrix results = r(table) -matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I6a Select") modify -putexcel A1 = matrix(results), names nformat(number_d2) - -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I6a Select") modify -putexcel A1 = matrix(i1a), names - -*********************************************************************************** -*I6b: amount of private pension income for those receiving private pension income -*********************************************************************************** - -regress ypnoab_lvl i.dgn i.state_pension_age ib1.deh_c3 lib1.dhhtp_c4 l.dhe ib8.drgn1 cl.ypncp c.growth stm /// -if dag >= 50 & les_c3 == 4 & l.les_c3 == 4 & l.receives_ypnoab == 0 & receives_ypnoab == 1 [pweight=dimxwt], vce(cluster idperson) base +regress ln_ypnoab i.dgn dag dagsq /*i.state_pension_age*/ ib1.deh_c3 li.les_c4 lib1.dhhtp_c4 /*l.dhe*/ dhe_pcs_L1 dhe_mcs_L1 /// +l.pred_hourly_wage ib8.drgn1 stm /*c.growth*/ y2020 y2021 i.dot /// +if dag >= 50 & dlrtrd == 1 & l.dlrtrd!=1 & l.les_c4 != 2 & receives_ypnoab [pweight=dimxwt], /// +vce(cluster idperson) base +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/uk_income_split", sheet("Process I6b Amount") modify +putexcel set "$dir_raw_results/income/income_split", sheet("Process I5a_amount E") replace putexcel A1 = matrix(results), names nformat(number_d2) +matrix i5a=get(VCE) +matrix list i5a +putexcel set "$dir_raw_results/income/income_split_vcm", sheet("Process I5a_amount 
VCE") replace +putexcel A1 = matrix(i5a), names +outreg2 stats(coef se pval) using "$dir_raw_results/income/I5a.doc", replace /// +title("Process I5a: Amount of private pension income. Sample: Individuals aged 50+ who were not retired in the previous year and receive private pension income.") /// + ctitle(Amount of private pension income) label side dec(2) noparen addstat(R2, e(r2), RMSE, e(rmse)) + + +* Save sample inclusion indicator and predicted probabilities +gen in_sample = e(sample) +predict p +gen sigma = e(rmse) + +save "$dir_validation_data/I5a_level_sample", replace + +scalar r2 = e(r2) +scalar N = e(N) +scalar rmse= e(rmse) + +* Results +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/income/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/income/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_income", sheet("UK_I5a_amount") modify +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+ +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_income", sheet("UK_I5a_amount") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling +// Need to variable label when add new variable to model. Order matters. +local var_list Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low /// + Les_c4_NotEmployed_L1 Dhhtp_c4_CoupleChildren_L1 Dhhtp_c4_SingleNoChildren_L1 Dhhtp_c4_SingleChildren_L1 /// + Dhe_pcs_L1 Dhe_mcs_L1 Hourly_wage_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +* Save RMSE +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A9 = ("I5a") B9 = rmse + + +* Goodness of fit +putexcel set "$dir_results/reg_income", sheet("Gof") modify + +putexcel A35 = /// + "I5a level - Receiving private pension income: was not retired last year", /// + bold + +putexcel A37 = "R-squared" +putexcel B37 = r2 +putexcel A38 = "N" +putexcel B38 = N + +drop in_sample p sigma +scalar drop r2 N + + +//end -matrix i1a=get(VCE) -matrix list i1a -putexcel set "$dir_data/uk_income_split_vcm", sheet("Process I6b Amount") modify -putexcel A1 = matrix(i1a), names - -*/ - +capture log close +graph drop _all diff --git 
a/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do b/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do index 6bbabdde1..3e0a797e4 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do @@ -1,10 +1,14 @@ ******************************************************************************** -* PROJECT: INAPP +* PROJECT: ESPON * SECTION: Leaving Parental Home -* OBJECT: Final Probit and Linear Regression Models - Weighted +* OBJECT: Final Probit Regression Model * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) -******************************************************************************** +* LAST UPDATE: 1 July 2025 DP +* COUNTRY: UK +* +* NOTES: +********************************************************************************** + clear all set more off set mem 200m @@ -13,25 +17,6 @@ set type double set maxvar 30000 -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains data files -global dir_data "${dir_work}/data" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - - ******************************************************************* cap log close log using "${dir_log}/reg_leaveParentalHome.log", replace @@ -39,74 +24,237 @@ log using "${dir_log}/reg_leaveParentalHome.log", replace use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -/*DP: note that the categories in les_c4 
used by Cara are different from the ones currently used -so the categories in the corresponsing Excel file were updated */ - -*Labeling and formating variables - -label define jbg 1 "Employed" 2 "Student" 3 "Not employed" 4 "Retired" - -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" -label variable dhhtp_c4 "Household Type: 4 Category" -label variable stm "Year" -label variable les_c4 "Employment Status: 4 Category" -label variable dhe "Self-rated Health" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable ydses_c5 "Annual Household Income Quintile" -label variable dlltsd "Long-term Sick or Disabled" - -label value dgn gdr -label value drgn1 rgna -label value dhhtp_c4 hht -label value les_c4 jbg -label value deh_c3 edd -label value ded yn - +do "$dir_do/variable_update" +* sample selection drop if dag < 16 -replace stm = stm - 2000 + + +xtset idperson swv -/*check if all covariates are available in the data*/ -recode dlftphm dgn dag dagsq deh_c3 les_c4 les_c3 ydses_c5 drgn1 stm (-9=.) 
+* Set Excel file -xtset idperson swv +* Info sheet + +putexcel set "$dir_results/reg_leave_parental_home", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing leaving parental home" +putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "P1a" +putexcel B5 = "Probit regression estimates for leaving the parental home - 18+, not in intitial education spell, living with parents in t-1" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Added: ethnicity-4 cat (dot); covid dummies (y2020 y2021); not partnered condition (dcpst != 1) to be consistent with the simulation" + +putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold ************************************ -*Process LPH1: Leave Parental Home * +* Process P1a: Leave Parental Home * ************************************ -*Process P1a: Probability of leaving the parental home. Sample: All non-student respondents living with a parent. -*Or Probability of leaving the parental home for those who have left education. (Students stay in the parental home). -*sample: All non-student respondents aged 18+ who lived with a parent at t-1 -fre dlftphm if (ded==0 & dag>=18 & l.dlftphm==0) +* Process P1a: Probability of leaving the parental home. 
+* Sample: All respondents living with a parent in t-1, aged 18+, not in initial +* education spell +* DV: Left parental home dummy of those who lived with parents in t-1 +* Note: Added not partnered condition as well to be consistent with the simulation +fre dlftphm if (ded == 0 & dag >= 18 & dcpst != 1) //3.65% + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot /// + if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_P1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(P1a, dimlwt) side dec(4) + +probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot /// + if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_P1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(P1a, disclwt) side dec(4) + +probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot /// + if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_P1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(P1a, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_P1a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ -probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm if (ded==0 & dag>=18 & l.dlftphm==0) [pweight=disclwt], vce(robust) +probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 
li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot /// + if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimxwt], vce(robust) + + + * save raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/leave_parent_home", sheet("Process P1a male grads") replace +putexcel set "$dir_raw_results/leave_parental_home/leave_parental_home", sheet("Process P1a") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/P1a.doc", replace /// -title("Process P1a: Probability of leaving the parental home. Sample: All non-student respondents living with a parent.") /// - ctitle(Leave parental home) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +outreg2 stats(coef se pval) using "$dir_raw_results/leave_parental_home/P1a.doc", replace /// +title("Process P1a: Probability of leaving the parental home. Sample: All respondents living with a parent and not in initial education spell.") /// + ctitle(Leave parental home) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +gen in_sample = e(sample) + + +predict p + +save "$dir_validation_data/P1a_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/leave_parental_home/var_cov", sheet("var_cov") /// + replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/leave_parental_home/var_cov", sheet("var_cov") /// + clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_leave_parental_home", sheet("UK_P1a") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated 
coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_leave_parental_home", sheet("UK_P1a") modify +putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) + + +* Labeling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dgn" +putexcel A3 = "Dag" +putexcel A4 = "Dag_sq" +putexcel A5 = "Deh_c3_Medium" +putexcel A6 = "Deh_c3_Low" +putexcel A7 = "Les_c3_Student_L1" +putexcel A8 = "Les_c3_NotEmployed_L1" +putexcel A9 = "Ydses_c5_Q2_L1" +putexcel A10 = "Ydses_c5_Q3_L1" +putexcel A11 = "Ydses_c5_Q4_L1" +putexcel A12 = "Ydses_c5_Q5_L1" +putexcel A13 = "UKC" +putexcel A14 = "UKD" +putexcel A15 = "UKE" +putexcel A16 = "UKF" +putexcel A17 = "UKG" +putexcel A18 = "UKH" +putexcel A19 = "UKJ" +putexcel A20 = "UKK" +putexcel A21 = "UKL" +putexcel A22 = "UKM" +putexcel A23 = "UKN" +putexcel A24 = "Year_transformed" +putexcel A25 = "Y2020" +putexcel A26 = "Y2021" +putexcel A27 = "Ethn_Asian" +putexcel A28 = "Ethn_Black" +putexcel A29 = "Ethn_Other" +putexcel A30 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dgn" +putexcel D1 = "Dag" +putexcel E1 = "Dag_sq" +putexcel F1 = "Deh_c3_Medium" +putexcel G1 = "Deh_c3_Low" +putexcel H1 = "Les_c3_Student_L1" +putexcel I1 = "Les_c3_NotEmployed_L1" +putexcel J1 = "Ydses_c5_Q2_L1" +putexcel K1 = "Ydses_c5_Q3_L1" +putexcel L1 = "Ydses_c5_Q4_L1" +putexcel M1 = "Ydses_c5_Q5_L1" +putexcel N1 = "UKC" +putexcel O1 = "UKD" +putexcel P1 = "UKE" +putexcel 
Q1 = "UKF" +putexcel R1 = "UKG" +putexcel S1 = "UKH" +putexcel T1 = "UKJ" +putexcel U1 = "UKK" +putexcel V1 = "UKL" +putexcel W1 = "UKM" +putexcel X1 = "UKN" +putexcel Y1 = "Year_transformed" +putexcel Z1 = "Y2020" +putexcel AA1 = "Y2021" +putexcel AB1 = "Ethn_Asian" +putexcel AC1 = "Ethn_Black" +putexcel AD1 = "Ethn_Other" +putexcel AE1 = "Constant" + + +* Goodness of fit + +putexcel set "$dir_results/reg_leave_parental_home", sheet("Gof") modify + +putexcel A3 = "P1a - Leaving parental home", bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N +putexcel E5 = "Chi^2" +putexcel F5 = chi2 +putexcel E6 = "Log likelihood" +putexcel F6 = ll +drop in_sample p +scalar drop r2_p N chi2 ll capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do index 55b7dbece..74577e496 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_partnership.do @@ -3,7 +3,13 @@ * SECTION: Unions * OBJECT: Final Probit Models * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* LAST UPDATE: 1 July 2025 DP +* COUNTRY: UK +* +*NOTES: +* +* Reduced number of covariates in union formation process +* for those in initial education spell to obtain estimates. 
******************************************************************************** clear all set more off @@ -13,25 +19,6 @@ set type double set maxvar 30000 -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains data files -global dir_data "${dir_work}/data" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - - ******************************************************************* cap log close log using "${dir_log}/reg_partnership.log", replace @@ -39,117 +26,649 @@ log using "${dir_log}/reg_partnership.log", replace use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -cap gen ypnbihs_dv_sq = ypnbihs_dv^2 - -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" - -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label define dces 1 "Both Employed" 2 "Employed, Spouse Not Employed" 3 "Not Employed, Spouse Employed" 4 "Both Not Employed" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age Squared" -label variable drgn1 "Region" 
-label variable stm "Year" -label variable les_c3 "Employment Status: 3 Category" -label variable dhe "Self-rated Health" -label variable dcpen "Entered a new Partnership" -label variable dcpex "Partnership dissolution" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable dnc "Number of Children in Household" -label variable dnc02 "Number of Children aged 0-2 in Household" -label variable ydses_c5 "Gross Annual Household Income Quintile" -label variable lesdf_c4 "Differential Employment Status" -label variable ypnbihs_dv "Personal Non-benefit Gross Income" -label variable ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared" -label variable ynbcpdf_dv "Differential Personal Non-Benefit Gross Income" -label variable dhhtp_c4 "Household Type: 4 Category" - -label value dgn gdr -label value drgn1 rgna -label value les_c3 lessp_c3 jbf -label value deh_c3 dehsp_c3 edd -label value dcpen dcpex yn -label value lesdf_c4 dces -label value dhhtp_c4 hht +do "$dir_do/variable_update" + + +*sample selection drop if dag < 16 -replace stm = stm - 2000 -/*check if all covariates are available in the data*/ -recode dcpen dgn dag dagsq ydses_c5 dnc dnc02 dhe deh_c3 dehsp_c3 les_c3 /// -ypnbihs_dv ypnbihs_dv_sq dnc dnc02 dhe dhesp ynbcpdf_dv dcpyy dcpagdf dhhtp_c4 lesdf_c4 /// -drgn1 stm (-9=. 
) xtset idperson swv +* Set Excel file + +* Info sheet + +putexcel set "$dir_results/reg_partnership", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters for relationship status projection" +putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "U1a" +putexcel B5 = "Probit regression estimates probability of entering a partnership - single respondents aged 18+ in initial education spell" +putexcel A6 = "U1b" +putexcel B6 = "Probit regression estimates of probability of entering a partnership - single respondents aged 18+ not in initial education spell" +putexcel A7 = "U2b" +putexcel B7 = "Probit regression estimates of probability of exiting a partnership - cohabiting women aged 18+ not in initial education spell" + +putexcel A10 = "Notes:", bold +putexcel B10 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot) and Covid dummies (y2020 y2021)" +putexcel B11 = "U1a: Just 73 obs with positive outcome! Cannot include region and covid dummies as covariates. Cannot obtain estimates of the 5th quintile of hh income" +putexcel B12 = "U2b contains a new variable New_rel_L1" + +putexcel set "$dir_results/reg_partnership", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold -*************************************************************** -*Process U1a: Entering a partnership - In continuous education * -*************************************************************** -*Probability of entering a partnership. -*Sample: All single respondents aged 18 and older, in continuous education. -fre dcpen if (dag>=18 & ded==1 & ssscp!=1) //exclude same sex couples +**************************************************** +* U1a: Partnership formation, in initial edu spell * +**************************************************** +* Probability of entering a partnership. 
+* Sample: All single respondents aged 18 +, in continuous education. +* DV: Enter partnership dummy +* Note: Requirement of being single in the previous year is embedded in the +* dependent variable +* Only 73 observation of relationships forming when still in initial +* education spell and aged 18+. + +fre dcpen if (dag >= 18 & ded == 1 & ssscp != 1) + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag>=18 & ded==1 & ssscp!=1) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(U1a, dimlwt) side dec(4) + +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag>=18 & ded==1 & ssscp!=1) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1a, disclwt) side dec(4) + +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag>=18 & ded==1 & ssscp!=1) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1a, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_U1a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ -probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 i.dhe ib8.drgn1 stm if (dag>=16 & ded==1 & ssscp!=1) [pweight=disclwt], vce(robust) +probit dcpen 
i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag>=18 & ded==1 & ssscp!=1) [pweight=dimxwt], vce(robust) + +* raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/union", sheet("Process U1a") replace +putexcel set "$dir_raw_results/partnership/partnership", sheet("U1a") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1a.doc", replace /// -title("Process U1a: Probit regression estimates for entering a partnership - single respondents aged 16+ in continuous education") /// +outreg2 stats(coef se pval) using "$dir_raw_results/partnership/U1a.doc", replace /// +title("Process U1a: Probit regression estimates for entering a partnership - single respondents aged 18+ in continuous education") /// ctitle(enter partnership) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/U1a_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/partnership/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/partnership/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_partnership", sheet("UK_U1a") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero 
coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_partnership", sheet("UK_U1a") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dgn" +putexcel A3 = "Dag" +putexcel A4 = "Dag_sq" +putexcel A5 = "Ydses_c5_Q2_L1" +putexcel A6 = "Ydses_c5_Q3_L1" +putexcel A7 = "Ydses_c5_Q4_L1" +putexcel A8 = "Dnc_L1" +putexcel A9 = "Dnc02_L1" +putexcel A10 = "Dhe_pcs" +putexcel A11 = "Dhe_mcs" +putexcel A12 = "Year_transformed" +putexcel A13 = "Ethn_Asian" +putexcel A14 = "Ethn_Black" +putexcel A15 = "Ethn_Other" +putexcel A16 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dgn" +putexcel D1 = "Dag" +putexcel E1 = "Dag_sq" +putexcel F1 = "Ydses_c5_Q2_L1" +putexcel G1 = "Ydses_c5_Q3_L1" +putexcel H1 = "Ydses_c5_Q4_L1" +putexcel I1 = "Dnc_L1" +putexcel J1 = "Dnc02_L1" +putexcel K1 = "Dhe_pcs" +putexcel L1 = "Dhe_mcs" +putexcel M1 = "Year_transformed" +putexcel N1 = "Ethn_Asian" +putexcel O1 = "Ethn_Black" +putexcel P1 = "Ethn_Other" +putexcel Q1 = "Constant" -******************************************************************** -*Process U1b: Entering a partnership - Not in continuous education * -******************************************************************** -*Probability of entering a partnership. 
-*Sample: All respondents aged 18+ who were not in a parthership at t-1 and were not in continuous education -fre dcpen if (dag>=18 & ded==0 & ssscp!=1) //exclude same sex couples - -probit dcpen i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 l.dnc l.dnc02 i.dhe ib8.drgn1 stm if (dag>=18 & ded==0 & ssscp!=1) [pweight=disclwt], vce(robust) +* Goodness of fit + +putexcel set "$dir_results/reg_partnership", sheet("Gof") modify + +putexcel A3 = "U1a - Partnership formation, in initial education spell", /// + bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N +putexcel E5 = "Chi^2" +putexcel F5 = chi2 +putexcel E6 = "Log likelihood" +putexcel F6 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + +******************************************************** +* U1b: Partnership formation, not in initial edu spell * +******************************************************** +* Process U1b: Probability of entering a partnership. +* Sample: All respondents aged 18+, left initial education spell and not in a +* same sex relationship +* DV: Enter partnership dummy (requires not having been in a relationship last +* year) +* Note: Requirement of being single in the previous year is embedded in the +* dependent variable +* Income captured by hh quintiles. 
+ +fre dcpen if (dag >= 18 & ded == 0 & ssscp != 1) + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(U1b, dimlwt) side dec(4) + +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1b, disclwt) side dec(4) + +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs /*ib8.drgn1*/ stm /*y2020 y2021*/ i.dot /// +if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U1b, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_U1b.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ + +probit dcpen i.dgn dag dagsq li.ydses_c5 l.dnc l.dnc02 /*dhe*/ dhe_pcs dhe_mcs ib8.drgn1 stm y2020 y2021 i.dot /// +if (dag >= 18 & ded == 0 & ssscp != 1) [pweight=dimxwt], vce(robust) + +* raw results (workbook opened with modify so the U1a sheet written earlier in this run is kept) matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/union", sheet("Process U1b") modify +putexcel set "$dir_raw_results/partnership/partnership", sheet("Process U1b") modify putexcel A3 = 
matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1b.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/partnership/U1b.doc", replace /// title("Process U1b: Probit regression estimates for entering a partnership - single respondents aged 18+ not in continuous education") /// ctitle(enter partnership) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + +gen in_sample = e(sample) +predict p -****************************************************************** -*Process 2b: Exiting a partnership - Not in continuous education * -****************************************************************** -*Probability of partnership break-up. -*Sample: Female member of a couple aged 18+ who were in a partnership at t-1 and not in a partnership at t and were not in continuous education -fre dcpex if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) //exclude same sex couples +save "$dir_validation_data/U1b_sample", replace -probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 li.dhe li.dhesp l.dcpyy l.dcpagdf l.dnc l.dnc02 lib1.dhhtp_c4 lib1.lesdf_c4 /// -l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dhhwt], vce(robust) +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/partnership/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/partnership/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_partnership", sheet("UK_U1b") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// 
Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_partnership", sheet("UK_U1b") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + +* Labelling (column A = regressor names, column B = coefficients, columns C onwards = var-cov matrix written at C2) + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dgn" +putexcel A3 = "Dag" +putexcel A4 = "Dag_sq" +putexcel A5 = "Ydses_c5_Q2_L1" +putexcel A6 = "Ydses_c5_Q3_L1" +putexcel A7 = "Ydses_c5_Q4_L1" +putexcel A8 = "Ydses_c5_Q5_L1" +putexcel A9 = "Dnc_L1" +putexcel A10 = "Dnc02_L1" +putexcel A11 = "Dhe_pcs" +putexcel A12 = "Dhe_mcs" +putexcel A13 = "UKC" +putexcel A14 = "UKD" +putexcel A15 = "UKE" +putexcel A16 = "UKF" +putexcel A17 = "UKG" +putexcel A18 = "UKH" +putexcel A19 = "UKJ" +putexcel A20 = "UKK" +putexcel A21 = "UKL" +putexcel A22 = "UKM" +putexcel A23 = "UKN" +putexcel A24 = "Year_transformed" +putexcel A25 = "Y2020" +putexcel A26 = "Y2021" +putexcel A27 = "Ethn_Asian" +putexcel A28 = "Ethn_Black" +putexcel A29 = "Ethn_Other" +putexcel A30 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dgn" +putexcel D1 = "Dag" +putexcel E1 = "Dag_sq" +putexcel F1 = "Ydses_c5_Q2_L1" +putexcel G1 = "Ydses_c5_Q3_L1" +putexcel H1 = "Ydses_c5_Q4_L1" +putexcel I1 = "Ydses_c5_Q5_L1" +putexcel J1 = "Dnc_L1" +putexcel K1 = "Dnc02_L1" +putexcel L1 = "Dhe_pcs" +putexcel M1 = "Dhe_mcs" +putexcel N1 = "UKC" +putexcel O1 = "UKD" +putexcel P1 = "UKE" +putexcel Q1 = "UKF" +putexcel R1 = "UKG" +putexcel S1 = "UKH" +putexcel T1 = "UKJ" +putexcel U1 = "UKK" +putexcel V1 = "UKL" 
+putexcel W1 = "UKM" +putexcel X1 = "UKN" +putexcel Y1 = "Year_transformed" +putexcel Z1 = "Y2020" +putexcel AA1 = "Y2021" +putexcel AB1 = "Ethn_Asian" +putexcel AC1 = "Ethn_Black" +putexcel AD1 = "Ethn_Other" +putexcel AE1 = "Constant" + +* Goodness of fit + +putexcel set "$dir_results/reg_partnership", sheet("Gof") modify + +putexcel A9 = "U1b - Partnership formation, left initial education spell", /// + bold + +putexcel A11 = "Pseudo R-squared" +putexcel B11 = r2_p +putexcel A12 = "N" +putexcel B12 = N +putexcel E11 = "Chi^2" +putexcel F11 = chi2 +putexcel E12 = "Log likelihood" +putexcel F12 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + +********************************************************** +* U2b: Partnership termination, not in initial edu spell * +********************************************************** + +* Process U2b: Probability of partnership break-up. +* Sample: Female member of a heterosexual couple in t-1 aged 18+ and not in +* continuous education +* DV: Exit partnership dummy +* Note: Requirement to be in a relationship last year is embedded in the DV. +* The ded condition refers to the female partner only. +* Removing the ded condition makes no difference because there +* are no splits among those in their initial education spell. 
+ +fre dcpex if (dgn == 0 & dag >= 18 & ded == 0 & ssscp != 1) + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02 lib1.lesdf_c4 /// + l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot /// + if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(U2b, dimlwt) side dec(4) + +probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02 lib1.lesdf_c4 /// + l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot /// + if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U2b, disclwt) side dec(4) + +probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02 lib1.lesdf_c4 /// + l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot /// + if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dhhwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U2b, dhhwt) side dec(4) +probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02 lib1.lesdf_c4 /// + l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot /// + if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dimxwt], vce(robust) +outreg2 using 
"${weight_checks}/weight_comparison_U2b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(U2b, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_U2b.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ +probit dcpex dag dagsq lib1.deh_c3 lib1.dehsp_c3 /*li.dhe li.dhesp*/ l.dhe_pcs l.dhe_mcs l.dhe_pcssp l.dhe_mcssp l.dcpyy l.new_rel l.dcpagdf l.dnc l.dnc02 lib1.lesdf_c4 /// + l.ypnbihs_dv l.ynbcpdf_dv ib8.drgn1 stm y2020 y2021 i.dot /// + if (dgn==0 & dag>=18 & ded==0 & ssscp!=1) [pweight=dimxwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/union", sheet("Process U2b") modify +putexcel set "$dir_raw_results/partnership/partnership", sheet("Process U2b") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U2b.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/partnership/U2b.doc", replace /// title("Process U2b: Probit regression estimates for exiting a partnership - cohabiting women aged 18+ not in continuous education") /// ctitle(enter partnership) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/U2b_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zeros values are eliminated + +matrix b = e(b) +matrix V = e(V) + +matrix list V + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/partnership/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/partnership/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { 
+ egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_partnership", sheet("UK_U2b") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_partnership", sheet("UK_U2b") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dag" +putexcel A3 = "Dag_sq" +putexcel A4 = "Deh_c3_Medium_L1" +putexcel A5 = "Deh_c3_Low_L1" +putexcel A6 = "Dehsp_c3_Medium_L1" +putexcel A7 = "Dehsp_c3_Low_L1" +putexcel A8 = "Dhe_pcs" +putexcel A9 = "Dhe_mcs" +putexcel A10 = "Dhe_pcssp" +putexcel A11 = "Dhe_mcssp" +putexcel A12 = "Dcpyy_L1" +putexcel A13 = "New_rel_L1" +putexcel A14 = "Dcpagdf_L1" +putexcel A15 = "Dnc_L1" +putexcel A16 = "Dnc02_L1" +putexcel A17 = "Lesdf_c4_EmployedSpouseNotEmployed_L1" +putexcel A18 = "Lesdf_c4_NotEmployedSpouseEmployed_L1" +putexcel A19 = "Lesdf_c4_BothNotEmployed_L1" +putexcel A20 = "Ypnbihs_dv_L1" +putexcel A21 = "Ynbcpdf_dv_L1" +putexcel A22 = "UKC" +putexcel A23 = "UKD" +putexcel A24 = "UKE" +putexcel A25 = "UKF" +putexcel A26 = "UKG" +putexcel A27 = "UKH" +putexcel A28 = "UKJ" +putexcel A29 = "UKK" +putexcel A30 = "UKL" +putexcel A31 = "UKM" +putexcel A32 = "UKN" +putexcel A33 = "Year_transformed" +putexcel A34 = "Y2020" +putexcel A35 = "Y2021" 
+putexcel A36 = "Ethn_Asian" +putexcel A37 = "Ethn_Black" +putexcel A38 = "Ethn_Other" +putexcel A39 = "Constant" + + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dag" +putexcel D1 = "Dag_sq" +putexcel E1 = "Deh_c3_Medium_L1" +putexcel F1 = "Deh_c3_Low_L1" +putexcel G1 = "Dehsp_c3_Medium_L1" +putexcel H1 = "Dehsp_c3_Low_L1" +putexcel I1 = "Dhe_pcs" +putexcel J1 = "Dhe_mcs" +putexcel K1 = "Dhe_pcssp" +putexcel L1 = "Dhe_mcssp" +putexcel M1 = "Dcpyy_L1" +putexcel N1 = "New_rel_L1" +putexcel O1 = "Dcpagdf_L1" +putexcel P1 = "Dnc_L1" +putexcel Q1 = "Dnc02_L1" +putexcel R1 = "Lesdf_c4_EmployedSpouseNotEmployed_L1" +putexcel S1 = "Lesdf_c4_NotEmployedSpouseEmployed_L1" +putexcel T1 = "Lesdf_c4_BothNotEmployed_L1" +putexcel U1 = "Ypnbihs_dv_L1" +putexcel V1 = "Ynbcpdf_dv_L1" +putexcel W1 = "UKC" +putexcel X1 = "UKD" +putexcel Y1 = "UKE" +putexcel Z1 = "UKF" +putexcel AA1 = "UKG" +putexcel AB1 = "UKH" +putexcel AC1 = "UKJ" +putexcel AD1 = "UKK" +putexcel AE1 = "UKL" +putexcel AF1 = "UKM" +putexcel AG1 = "UKN" +putexcel AH1 = "Year_transformed" +putexcel AI1 = "Y2020" +putexcel AJ1 = "Y2021" +putexcel AK1 = "Ethn_Asian" +putexcel AL1 = "Ethn_Black" +putexcel AM1 = "Ethn_Other" +putexcel AN1 = "Constant" + +* Goodness of fit + +putexcel set "$dir_results/reg_partnership", sheet("Gof") modify + +putexcel A15 = /// + "U2b - Partnership termination, left initial education spell", bold +putexcel A17 = "Pseudo R-squared" +putexcel B17 = r2_p +putexcel A18 = "N" +putexcel B18 = N +putexcel E17 = "Chi^2" +putexcel F17 = chi2 +putexcel E18 = "Log likelihood" +putexcel F18 = ll +drop in_sample p +scalar drop r2_p N chi2 ll + + capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do b/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do index d0cdecf27..f811e6b2a 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_retirement.do 
@@ -3,7 +3,11 @@ * SECTION: Retirement * OBJECT: Final Regresion Models * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* LAST UPDATE: 1 July 2025 DP +* COUNTRY: UK +* +* NOTES: +* ******************************************************************************** clear all set more off @@ -13,25 +17,6 @@ set type double set maxvar 30000 -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains data files -global dir_data "${dir_work}/data" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - - ******************************************************************* cap log close log using "${dir_log}/reg_retirement.log", replace @@ -39,79 +24,475 @@ log using "${dir_log}/reg_retirement.log", replace use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -*Labeling and formating variables -label define jbf 1 "Employed" 2 "Student" 3 "Not Employed" - -label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification" - -label define gdr 1 "Male" 0 "Female" - -label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" /// -6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" /// -12 "Scotland" 13 "Northern Ireland" - -label define yn 1 "Yes" 0 "No" - -label define hht 1 "Couples with No Children" 2 "Couples with Children" /// - 3 "Single with No Children" 4 "Single with Children" - -label variable dgn "Gender" -label variable dag "Age" -label variable dagsq "Age 
Squared" -label variable drgn1 "Region" -label variable stm "Year" -label variable les_c3 "Employment Status: 3 Category" -label variable dhe "Self-rated Health" -label variable deh_c3 "Educational Attainment: 3 Category" -label variable dhhtp_c4 "Household Type: 4 Category" - -label value dgn gdr -label value drgn1 rgna -label value les_c3 lessp_c3 jbf -label value deh_c3 dehsp_c3 edd -label value dcpen dcpex dlrtrd dagpns dagpns_sp yn -label value dhhtp_c4 hht +do "$dir_do/variable_update" + +* sample selection drop if dag < 16 -replace stm = stm - 2000 -*check if all covariates are available and recode missing values -recode dgn dag dagsq deh_c3 dagpns lesnr_c2 ydses_c5 dlltsd drgn1 stm dcpst drtren dagpns_sp lessp_c3 dlltsd_sp dcpst (-9=.) xtset idperson swv -******************************************* -*Process R1a: Enter Retirement - Single * -******************************************* -*Sample: Non-partnered individuals aged 50+ who are not yet retired. -probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 li.ydses_c5 li.dlltsd ib8.drgn1 stm /// -if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimlwt], vce(robust) +* Set Excel file + +* Info sheet + +putexcel set "$dir_results/reg_retirement", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing projection of retirement" +putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold + +putexcel A5 = "R1a" +putexcel B5 = "Probit regression estimates of the probability of retiring, single individuals aged 50+ not yet retired" + +putexcel A6 = "R1b" +putexcel B6 = "Probit regression estimates of the probability of retiring, cohabiting individuals aged 50+ not yet retired" + +putexcel A10 = "Notes:", bold +putexcel B10 = "replaced dlltsd with dlltsd01; added dhe_pcs and dhe_mcs, ethnicity-4 cat(dot) and Covid dummies (y2020 y2021)" + 
+putexcel set "$dir_results/reg_retirement", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +**************************** +* R1a: Retirement - Single * +**************************** + +* Process R1a: Probability retire if single +* Sample: Non-partnered individuals aged 50+ who are not yet retired. +* DV: Enter retirement dummy (have to not be retired last year) + +fre drtren if ((dcpst==2 | dcpst==3) & dag>=50) + +/*///////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + li.ydses_c5 li.dlltsd ib8.drgn1 stm y2020 y2021 i.dot /// + if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimlwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_R1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(R1a, dimlwt) side dec(4) + +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + li.ydses_c5 li.dlltsd ib8.drgn1 stm y2020 y2021 i.dot /// + if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_R1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1a, disclwt) side dec(4) + +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + li.ydses_c5 li.dlltsd ib8.drgn1 stm y2020 y2021 i.dot /// + if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_R1a.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1a, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_R1a.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns 
li.lesnr_c2 /// + li.ydses_c5 li.dlltsd01 l.dhe_pcs l.dhe_mcs /// + ib8.drgn1 stm y2020 y2021 i.dot /// + if ((dcpst==2 | dcpst==3) & dag>=50) [pweight=dimxwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/retire", sheet("Process R1a") replace +putexcel set "$dir_raw_results/retirement/retirement", sheet("Process R1a") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/R1a.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/retirement/R1a.doc", replace /// title("Process R1a: Probit regression estimates for retiring - single individuals aged 50+ not yet retired") /// ctitle(retiring) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) +gen in_sample = e(sample) +predict p + +save "$dir_validation_data/R1a_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zero values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/retirement/var_cov", sheet("var_cov") /// + replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/retirement/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_retirement", sheet("UK_R1a") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only 
non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_retirement", sheet("UK_R1a") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) -********************************************* -*Process R1b: Enter Retirement - Partnered * -********************************************* -*Sample: Partnered individuals aged 50+ who are not yet retired. -probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp li.lessp_c3 li.dlltsd_sp /// -ib8.drgn1 stm if (ssscp!=1 & dcpst==1 & dag>=50) [pweight=dimlwt], vce(robust) + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dgn" +putexcel A3 = "Dag" +putexcel A4 = "Dag_sq" +putexcel A5 = "Deh_c3_Medium" +putexcel A6 = "Deh_c3_Low" +putexcel A7 = "Reached_Retirement_Age" +putexcel A8 = "Lesnr_c2_NotEmployed_L1" +putexcel A9 = "Ydses_c5_Q2_L1" +putexcel A10 = "Ydses_c5_Q3_L1" +putexcel A11 = "Ydses_c5_Q4_L1" +putexcel A12 = "Ydses_c5_Q5_L1" +putexcel A13 = "Dlltsd01_L1" +putexcel A14 = "Dhe_pcs_L1" +putexcel A15 = "Dhe_mcs_L1" +putexcel A16 = "UKC" +putexcel A17 = "UKD" +putexcel A18 = "UKE" +putexcel A19 = "UKF" +putexcel A20 = "UKG" +putexcel A21 = "UKH" +putexcel A22 = "UKJ" +putexcel A23 = "UKK" +putexcel A24 = "UKL" +putexcel A25 = "UKM" +putexcel A26 = "UKN" +putexcel A27 = "Year_transformed" +putexcel A28 = "Y2020" +putexcel A29 = "Y2021" +putexcel A30 = "Ethn_Asian" +putexcel A31 = "Ethn_Black" +putexcel A32 = "Ethn_Other" +putexcel A33 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dgn" +putexcel D1 = "Dag" +putexcel E1 = "Dag_sq" +putexcel F1 = "Deh_c3_Medium" +putexcel G1 = "Deh_c3_Low" +putexcel H1 = "Reached_Retirement_Age" +putexcel I1 = "Lesnr_c2_NotEmployed_L1" +putexcel J1 = 
"Ydses_c5_Q2_L1" +putexcel K1 = "Ydses_c5_Q3_L1" +putexcel L1 = "Ydses_c5_Q4_L1" +putexcel M1 = "Ydses_c5_Q5_L1" +putexcel N1 = "Dlltsd01_L1" +putexcel O1 = "Dhe_pcs_L1" +putexcel P1 = "Dhe_mcs_L1" +putexcel Q1 = "UKC" +putexcel R1 = "UKD" +putexcel S1 = "UKE" +putexcel T1 = "UKF" +putexcel U1 = "UKG" +putexcel V1 = "UKH" +putexcel W1 = "UKJ" +putexcel X1 = "UKK" +putexcel Y1 = "UKL" +putexcel Z1 = "UKM" +putexcel AA1 = "UKN" +putexcel AB1 = "Year_transformed" +putexcel AC1 = "Y2020" +putexcel AD1 = "Y2021" +putexcel AE1 = "Ethn_Asian" +putexcel AF1 = "Ethn_Black" +putexcel AG1 = "Ethn_Other" +putexcel AH1 = "Constant" + + +* Goodness of fit + +putexcel set "$dir_results/reg_retirement", sheet("Gof") modify + +putexcel A3 = "R1a - Retirement single", bold + +putexcel A5 = "Pseudo R-squared" +putexcel B5 = r2_p +putexcel A6 = "N" +putexcel B6 = N +putexcel E5 = "Chi^2" +putexcel F5 = chi2 +putexcel E6 = "Log likelihood" +putexcel F6 = ll + +drop in_sample p +scalar drop r2_p N chi2 ll + + + + +****************************** +* R1b: Retirement, partnered * +****************************** + +* Process R1b: Probability retire +* Sample: Partnered heterosexual individuals aged 50+ who are not yet retired +* DV: Enter retirement dummy (have to not be retired last year) +count if (ssscp!=1 & dcpst==1 & dag>=50) & lessp_c3==2 //115 obs partnered with students +drop if (ssscp!=1 & dcpst==1 & dag>=50) & lessp_c3==2 //drop partnered with students + +fre drtren if (ssscp!=1 & dcpst==1 & dag>=50) + +/*////////////////////////////////////////////////////////////////////////////////////////////////// +//check weights ////////////////////////////////////////////////////////////////////////////////// +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp /// + li.lessp_c3 li.dlltsd_sp ib8.drgn1 stm y2020 y2021 i.dot if /// + (ssscp!=1 & dcpst==1 & dag>=50) [pweight=dimlwt], vce(robust) +outreg2 using 
"${weight_checks}/weight_comparison_R1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) replace ctitle(R1b, dimlwt) side dec(4) + +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp /// + li.lessp_c3 li.dlltsd_sp ib8.drgn1 stm y2020 y2021 i.dot if /// + (ssscp!=1 & dcpst==1 & dag>=50) [pweight=disclwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_R1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1b, disclwt) side dec(4) + +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd i.dagpns_sp /// + li.lessp_c3 li.dlltsd_sp ib8.drgn1 stm y2020 y2021 i.dot if /// + (ssscp!=1 & dcpst==1 & dag>=50) [pweight=dimxwt], vce(robust) +outreg2 using "${weight_checks}/weight_comparison_R1b.xls", alpha(0.001, 0.01, 0.05, 0.1) symbol(***, **, *, +) append ctitle(R1b, dimxwt) side dec(4) +erase "${weight_checks}/weight_comparison_R1b.txt" +//////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////// +*/ + +probit drtren i.dgn dag dagsq ib1.deh_c3 i.dagpns li.lesnr_c2 /// + i.dagpns#li.lesnr_c2 li.ydses_c5 li.dlltsd01 l.dhe_pcs l.dhe_mcs i.dagpns_sp /// + li.lessp_c3 li.dlltsd01_sp ib8.drgn1 stm y2020 y2021 i.dot if /// + (ssscp!=1 & dcpst==1 & dag>=50) [pweight=dimxwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/retire", sheet("Process R1b") modify +putexcel set "$dir_raw_results/retirement/retirement", sheet("Process R1b") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/R1b.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/retirement/R1b.doc", replace /// title("Process R1b: Probit 
regression estimates for retiring - cohabiting individuals aged 50+ not yet retired") /// ctitle(retiring) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) + + +gen in_sample = e(sample) + +predict p + +save "$dir_validation_data/R1b_sample", replace + +scalar r2_p = e(r2_p) +scalar N = e(N) +scalar chi2 = e(chi2) +scalar ll = e(ll) + + +* Results +* Note: Zero values are eliminated + +matrix b = e(b) +matrix V = e(V) + + +* Store variance-covariance matrix + +preserve + +putexcel set "$dir_raw_results/retirement/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/retirement/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) +putexcel set "$dir_results/reg_retirement", sheet("UK_R1b") modify +putexcel C2 = matrix(var) + +restore + + +* Store estimated coefficients + +// Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +// Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +// Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+ +// Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_results/reg_retirement", sheet("UK_R1b") modify +putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) + + +* Labelling + +putexcel A1 = "REGRESSOR" +putexcel A2 = "Dgn" +putexcel A3 = "Dag" +putexcel A4 = "Dag_sq" +putexcel A5 = "Deh_c3_Medium" +putexcel A6 = "Deh_c3_Low" +putexcel A7 = "Reached_Retirement_Age" +putexcel A8 = "Lesnr_c2_NotEmployed_L1" +putexcel A9 = "Reached_Retirement_Age_Lesnr_c2_NotEmployed_L1" +putexcel A10 = "Ydses_c5_Q2_L1" +putexcel A11 = "Ydses_c5_Q3_L1" +putexcel A12 = "Ydses_c5_Q4_L1" +putexcel A13 = "Ydses_c5_Q5_L1" +putexcel A14 = "Dlltsd01_L1" +putexcel A15 = "Dhe_pcs_L1" +putexcel A16 = "Dhe_mcs_L1" +putexcel A17 = "Reached_Retirement_Age_Sp" +putexcel A18 = "Lessp_c3_NotEmployed_L1" +putexcel A19 = "Dlltsd01_sp_L1" +putexcel A20 = "UKC" +putexcel A21 = "UKD" +putexcel A22 = "UKE" +putexcel A23 = "UKF" +putexcel A24 = "UKG" +putexcel A25 = "UKH" +putexcel A26 = "UKJ" +putexcel A27 = "UKK" +putexcel A28 = "UKL" +putexcel A29 = "UKM" +putexcel A30 = "UKN" +putexcel A31 = "Year_transformed" +putexcel A32 = "Y2020" +putexcel A33 = "Y2021" +putexcel A34 = "Ethn_Asian" +putexcel A35 = "Ethn_Black" +putexcel A36 = "Ethn_Other" +putexcel A37 = "Constant" + +putexcel B1 = "COEFFICIENT" +putexcel C1 = "Dgn" +putexcel D1 = "Dag" +putexcel E1 = "Dag_sq" +putexcel F1 = "Deh_c3_Medium" +putexcel G1 = "Deh_c3_Low" +putexcel H1 = "Reached_Retirement_Age" +putexcel I1 = "Lesnr_c2_NotEmployed_L1" +putexcel J1 = "Reached_Retirement_Age_Les_c3_NotEmployed_L1" +putexcel K1 = "Ydses_c5_Q2_L1" +putexcel L1 = "Ydses_c5_Q3_L1" +putexcel M1 = "Ydses_c5_Q4_L1" +putexcel N1 = "Ydses_c5_Q5_L1" +putexcel O1 = "Dlltsd01_L1" +putexcel P1 = "Dhe_pcs_L1" +putexcel Q1 = "Dhe_mcs_L1" +putexcel R1 = "Reached_Retirement_Age_Sp" +putexcel S1 
= "Lessp_c3_NotEmployed_L1" +putexcel T1 = "Dlltsd01_sp_L1" +putexcel U1 = "UKC" +putexcel V1 = "UKD" +putexcel W1 = "UKE" +putexcel X1 = "UKF" +putexcel Y1 = "UKG" +putexcel Z1 = "UKH" +putexcel AA1 = "UKJ" +putexcel AB1 = "UKK" +putexcel AC1 = "UKL" +putexcel AD1 = "UKM" +putexcel AE1 = "UKN" +putexcel AF1 = "Year_transformed" +putexcel AG1 = "Y2020" +putexcel AH1 = "Y2021" +putexcel AI1 = "Ethn_Asian" +putexcel AJ1 = "Ethn_Black" +putexcel AK1 = "Ethn_Other" +putexcel AL1 = "Constant" + + +* Goodness of fit + +putexcel set "$dir_results/reg_retirement", sheet("Gof") modify + +putexcel A9 = "R1b - Retirement partnered", bold + +putexcel A11 = "Pseudo R-squared" +putexcel B11 = r2_p +putexcel A12 = "N" +putexcel B12 = N +putexcel E11 = "Chi^2" +putexcel F11 = chi2 +putexcel E12 = "Log likelihood" +putexcel F12 = ll +drop in_sample p +scalar drop r2_p N chi2 ll capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do b/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do index 6d6928863..1ea5167c1 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_unemployment.do @@ -13,54 +13,34 @@ set type double set maxvar 30000 -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains data files -global dir_data "${dir_work}/data" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - - 
******************************************************************* cap log close log using "${dir_log}/reg_unemployment.log", replace ******************************************************************* -/******************************************************************************* -* START ANALYSIS -*******************************************************************************/ - - /******************************************************************************* * IMPORT UNEMPLOYMENT RATES *******************************************************************************/ -import delimited "${dir_data}/unemp_rates.csv", clear -save "${dir_data}/unemp_rates", replace +//import delimited "${dir_external_data}/unemp_rates.csv", clear +//save "${dir_external_data}/unemp_rates", replace /******************************************************************************* * LOAD WORKING DATA *******************************************************************************/ use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear + +do "$dir_do/variable_update" + keep if (dag>15 & dag<75) -// append unemployment rates to data -merge m:1 dgn deh_c3 dag stm using "${dir_data}/unemp_rates", keep(3) nogen +/* append unemployment rates to data +merge m:1 dgn deh_c3 dag stm using "${dir_external_data}/unemp_rates", keep(3) nogen label variable dukue "UK unemployment rate by age, year, gender, and graduate status" +*/ -gen unemp = (jbstat==3) +//gen unemp = (jbstat==3) label variable unemp "labour status unemployed" gen nemp = (jbstat!=1 & jbstat!=2 & jbstat!=10 & jbstat!=11) replace nemp = . 
if (jbstat==4 | jbstat==5 | jbstat==7 | jbstat==8 | jbstat==9 | jbstat==12 | jbstat==13 | jbstat==14) @@ -88,53 +68,61 @@ gen dc02 = (dnc02>0) * CALCULATE REGRESSION *******************************************************************************/ xtset idperson swv -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==1 & dag>17 & dag<65 & deh_c3==1) [pweight=disclwt], vce(robust) + +probit unemp /*dukue i.dhe*/ l.dhe_mcs l.dhe_pcs l.nemp ib8.drgn1 i.dot if (dgn==1 & dag>17 & dag<65 & deh_c3==1) [pweight=disclwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1a male grads") replace +putexcel set "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/unemployment", sheet("Process U1a male grads") replace putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1a.doc", replace /// +outreg2 stats(coef se pval) using "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/U1a.doc", replace /// title("Process U1a: Probability of unemployment. 
Sample: Men aged 18-64 with graduate education.") /// ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) - -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==1 & dag>17 & dag<65 & deh_c3>1) [pweight=disclwt], vce(robust) + +probit unemp /*dukue i.dhe*/ l.dhe_mcs l.dhe_pcs l.nemp ib8.drgn1 i.dot if (dgn==1 & dag>17 & dag<65 & deh_c3>1) [pweight=disclwt], vce(robust) + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1b male ngrads") modify +putexcel set "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/unemployment", sheet("Process U1b male ngrads") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1b.doc", replace /// +outreg2 stats(coef se pval) using "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/U1b.doc", replace /// title("Process U1b: Probability of unemployment. 
Sample: Men aged 18-64 with non-graduate education.") /// ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==0 & dag>17 & dag<65 & deh_c3==1) [pweight=disclwt], vce(robust) + + probit unemp /*dukue i.dhe*/ l.dhe_mcs l.dhe_pcs l.nemp ib8.drgn1 i.dot if (dgn==0 & dag>17 & dag<65 & deh_c3==1) [pweight=disclwt], vce(robust) + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1c female grads") modify +putexcel set "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/unemployment", sheet("Process U1c female grads") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1c.doc", replace /// +outreg2 stats(coef se pval) using "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/U1c.doc", replace /// title("Process U1c: Probability of unemployment. 
Sample: Women aged 18-64 with graduate education.") /// ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) -probit unemp dukue i.dhe l.nemp ib8.drgn1 if (dgn==0 & dag>17 & dag<65 & deh_c3>1) [pweight=disclwt], vce(robust) -matrix results = r(table) + +probit unemp /*dukue i.dhe*/ l.dhe_mcs l.dhe_pcs l.nemp ib8.drgn1 i.dot if (dgn==0 & dag>17 & dag<65 & deh_c3>1) [pweight=disclwt], vce(robust) + + * raw results matrix results = r(table) matrix results = results[1..6,1...]' -putexcel set "$dir_data/unempoyment", sheet("Process U1d female ngrads") modify +putexcel set "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/unemployment", sheet("Process U1d female ngrads") modify putexcel A3 = matrix(results), names nformat(number_d2) putexcel J4 = matrix(e(V)) -outreg2 stats(coef se pval) using "$dir_data/U1d.doc", replace /// +outreg2 stats(coef se pval) using "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\raw_results\unemployment/U1d.doc", replace /// title("Process U1d: Probability of unemployment. 
Sample: Women aged 18-64 with non-graduate education.") /// ctitle(Giving birth) label side dec(2) noparen addstat(R2, e(r2_p), Chi2, e(chi2), Log-likelihood, e(ll)) -// exploratory regressions +/* exploratory regressions probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==0 & deh_c3==1 & stm>2017) probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==0 & deh_c3>1 & stm>2017) probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==1 & deh_c3==1 & stm>2017) probit unemp i.ageGroup dukue carer recare i.dhe l.unemp ib1.dcpst dnc dnc02 i.drgn1 if (dgn==1 & deh_c3>1 & stm>2017) - +*/ capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do b/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do index 64f1787d5..73dffddd5 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_wages.do @@ -2,8 +2,8 @@ * PROJECT: ESPON * SECTION: Wage regression * OBJECT: Heckman regressions -* AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 21/04/2024 (JV) +* AUTHORS: Patryk Bronka, Daria Popova, Justin van de Ven +* LAST UPDATE: 3 July 2025 DP ******************************************************************************** clear all set more off @@ -13,25 +13,6 @@ set type double set maxvar 30000 -/******************************************************************************* -* DEFINE DIRECTORIES -*******************************************************************************/ -* Working directory -global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\regression_estimates" - -* Directory which contains do files -global dir_do "${dir_work}/do" - -* Directory which contains data files -global dir_data "${dir_work}/data" - -* Directory which contains log files -global dir_log "${dir_work}/log" - -* Directory which 
contains pooled UKHLS dataset -global dir_ukhls_data "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations\data" - - ******************************************************************* cap log close log using "${dir_log}/reg_wages.log", replace @@ -75,45 +56,45 @@ program computePredicted end -capture program drop analyseFit +capture program drop analyseFit program analyseFit - local filter = "`1'" + // 1 = filter + // 2 = optional flag "nocorr" + // 3 = title + // 4 = suffix for filename + + local filter = "`1'" + + quietly sum lwage_hour lwage_hour_hat wage_hour wage_hour_hat if `filter' + + if "`2'" != "nocorr" { + corr wage_hour L1.wage_hour if `filter' & previouslyWorking + corr wage_hour_hat L1.wage_hour_hat if `filter' & previouslyWorking + } + + // Log wage graph + twoway (hist lwage_hour if `filter', lcolor(gs12) fcolor(gs12) ) /// + (hist lwage_hour_hat if `filter', fcolor(none) lcolor(red) ), /// + xtitle("log gross hourly wages (GBP)") /// + legend(label(1 "observed") label(2 "predicted")) /// + name(log, replace) /// + title("`3'") + + graph export "${dir_validation_graphs}/wages/log_`4'.png", replace + + // Level wage graph + twoway (hist wage_hour if `filter' & wage_hour < 150, percent lcolor(gs12) fcolor(gs12) start(0) width(1)) /// + (hist wage_hour_hat if `filter' & wage_hour_hat < 150, percent fcolor(none) lcolor(red) start(0) width(1)), /// + xtitle("gross hourly wages (GBP)") /// + legend(label(1 "observed") label(2 "predicted")) /// + name(levels, replace) /// + title("`3'") + + graph export "${dir_validation_graphs}/wages/level_`4'.png", replace - sum lwage_hour lwage_hour_hat wage_hour wage_hour_hat if `filter' - if ("`2'" != "nocorr") { - corr wage_hour L1.wage_hour if `filter' & previouslyWorking - corr wage_hour_hat L1.wage_hour_hat if `filter' & previouslyWorking - } - - twoway (hist lwage_hour if `filter', lcolor(gs12) fcolor(gs12)) /// - (hist lwage_hour_hat if `filter', fcolor(none) lcolor(red)), xtitle (log gross hourly wages 
(GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(log, replace) - - twoway (hist wage_hour if `filter' & wage_hour < 150, lcolor(gs12) fcolor(gs12)) /// - (hist wage_hour_hat if `filter' & wage_hour_hat < 150, fcolor(none) lcolor(red)), xtitle (gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(levels, replace) - end -capture program drop analyseFit2 -program analyseFit2 - - local filter = "`1'" - - sum lwage_hour lwage_hour_hat wage_hour wage_hour_hat if `filter' - if ("`2'" != "nocorr") { - corr wage_hour L1.wage_hour if `filter' & previouslyWorking - corr wage_hour_hat L1.wage_hour_hat if `filter' & previouslyWorking - } - - twoway (hist lwage_hour if `filter', lcolor(gs12) fcolor(gs12)) /// - (hist lwage_hour_hat if `filter', fcolor(none) lcolor(red)), xtitle (log gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(log, replace) title("`3'") - graph export "${dir_graphs}/log_`4'", replace - - twoway (hist wage_hour if `filter' & wage_hour < 150, lcolor(gs12) fcolor(gs12)) /// - (hist wage_hour_hat if `filter' & wage_hour_hat < 150, fcolor(none) lcolor(red)), xtitle (gross hourly wages (GBP)) legend(lab(1 "observed") lab( 2 "predicted")) name(levels, replace) title("`3'") - graph export "${dir_graphs}/level_`4'", replace - -end capture program drop outputResults program outputResults @@ -122,12 +103,12 @@ program outputResults matrix results = r(table) matrix results = results[1..6,1...]' //extract the first six rows of results, and then transpose results - putexcel set "$dir_data/`outputFile'.xlsx", sheet("Estimates") replace + putexcel set "$dir_raw_results/wages/`outputFile'.xlsx", sheet("Estimates") replace putexcel A3 = matrix(results), names nformat(number_d2) matrix results = e(V) - putexcel set "$dir_data/`outputFile'.xlsx", sheet("Varcov") modify + putexcel set "$dir_raw_results/wages/`outputFile'.xlsx", sheet("Varcov") modify putexcel A3 = matrix(results), names nformat(number_d2) end @@ -171,7 
+152,7 @@ save "$work_dir/growth_rates", replace // Note: use code above if calculating real wage growth inside of the simulation, but if loading from excel use values from excel in Stata too. //They *should* be the same but it is more consistent to have one source of values. -import excel "$dir_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real wage growth rates +import excel "$dir_external_data/time_series_factor.xlsx", sheet("UK_wage_growth") firstrow clear // Import real wage growth rates rename Year stm rename Value real_wage_growth replace stm = stm - 2000 @@ -179,7 +160,7 @@ sum real_wage_growth if stm == 15 gen base = r(mean) replace real_wage_growth = real_wage_growth / base // Note: switching from 100 base to 1 base as that's what happens in the simulation when rebasing indices drop base -save "$dir_data/growth_rates", replace +save "$dir_external_data/growth_rates", replace /**************************************************************/ @@ -189,46 +170,32 @@ save "$dir_data/growth_rates", replace /**************************************************************/ use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear -drop if dag < $min_age +do "$dir_do/variable_update" + +drop if dag < $min_age + * screen data to ensure that idperson and swv uniquely identify observations sort idperson swv +duplicates report idperson swv gen chk = 0 replace chk = 1 if (idperson == idperson[_n-1] & swv == swv[_n-1]) drop if chk == 1 -* Fill in missing information on year (stm) based on wave (swv) -/* -replace stm = 2009 if swv == 1 & missing(stm) -replace stm = 2010 if swv == 2 & missing(stm) -replace stm = 2011 if swv == 3 & missing(stm) -replace stm = 2012 if swv == 4 & missing(stm) -replace stm = 2013 if swv == 5 & missing(stm) -replace stm = 2014 if swv == 6 & missing(stm) -replace stm = 2015 if swv == 7 & missing(stm) -replace stm = 2016 if swv == 8 & missing(stm) -replace stm = 2017 if swv == 9 & missing(stm) -replace stm = 2018 if 
swv == 10 & missing(stm) -replace stm = 2019 if swv == 11 & missing(stm) -replace stm = 2020 if swv == 12 & missing(stm) -replace stm = 2021 if swv == 13 & missing(stm) -*/ - -replace stm = stm - 2000 - /**************************************************************/ * * merge in real growth index from microsimulation's input folder * /**************************************************************/ -merge m:1 stm using "$dir_data/growth_rates", keep(3) nogen keepusing(real_wage_growth) +merge m:1 stm using "$dir_external_data/growth_rates", keep(3) nogen keepusing(real_wage_growth) //rename drgnl drgn1 // Rename region variable to drgn1 (one, not "l") *Variable stm identifies time periods. Need to ensure that combining idperson and stm ensures uniqueness. +duplicates report idperson stm duplicates tag idperson stm, gen(dup) -sort idperson stm -//DP: no such cases // +sort idperson stm +/*DP: no duplicates in terms of idperson and stm therefore the code below in no longer needed *However, this affects many variables: idhh, dag, ddt, dpd, ddt01, potentially idpartner. Might be best to move entire household. *Furthermore, the duplicated observation can occur in a year for which y-1 and y+1 have been observed. @@ -250,19 +217,16 @@ gen count_year = stm - min_observed_year sort idperson stm swv // Sort interview date in ascending order - earliest interview will be the one with the gap_prev set to 1 by idperson: gen gap_prev = (((count_year - count_year[_n-1]) > 1) & count_year>0) // There is a gap in year -1 by idperson: replace gap_prev = 1 if _n == 1 & dup == 1 & stm > 2009 -//DP: 0 cases gsort +idperson -stm -swv // Sort years in reverse order. 
Sort int date in descending order - later interview will be the one with gap_next set to 1 by idperson: gen gap_next = (((count_year - count_year[_n-1]) < -1) & stm != 2018) // There is a gap in year +1 sort idperson stm swv by idperson: replace gap_next = 1 if _n == _N & dup == 1 -//DP: 1,547 real changes made by idperson: replace gap_prev = 0 if gap_next[_n-1] == 1 & dup[_n-1] == 1 // If previous observation already has flag set to move to next period, can't move another one to the same period -//DP: 3,193 real changes made + *Check if whole household is duplicated bys idhh swv: egen min_dup = min(dup) // If == 1, then every observation for that household is duplicated -// 18480 cases *Check if whole household can be moved either back or forward: bys idhh stm: egen hh_gap_prev = min(gap_prev) @@ -270,12 +234,12 @@ bys idhh stm: egen hh_gap_next = min(gap_next) *Generate identifier for the whole household which should be moved: move the observation from the wave which is closer to the gap gen move = 1 if dup == 1 & (hh_gap_prev == 1 | hh_gap_next == 1) & min_dup == 1 -//DP: 6548 cases *Move observations: replace stm = stm-1 if move == 1 & hh_gap_prev == 1 /*3,425 real changes made*/ replace stm = stm+1 if move == 1 & hh_gap_next == 1 /*3,123 real changes made*/ + *Drop households with duplicated observations, keeping observations from more recent waves if duplicated years: sort stm idperson swv drop dup @@ -284,13 +248,18 @@ by stm idperson: egen max_wave = max(swv) // Keep more recent obs gen drop_idhh = idhh if max_wave == swv & dup == 1 // This identifies idhh which should be dropped bys idhh stm: egen drop_idhh_max = max(drop_idhh) drop if !missing(drop_idhh_max) -//DP: 8,119 observations deleted -duplicates drop idperson stm, force // Few duplicates left, drop - -**************************************** +duplicates drop idperson stm, force +*/ + +/**************************************************************/ +* +* preliminaries +* 
+/**************************************************************/ * Setting STATA to recognize Panel Data xtset idperson stm + * total hours work per week (average) gen hours = 0 replace hours = jbhrs if ((jbhrs > 0) & (jbhrs < .)) @@ -334,9 +303,13 @@ gen yplgrs_dv_level = sinh(yplgrs_dv) gen wage_hour = . replace wage_hour = yplgrs_dv_level / hours / 4.333 if (yplgrs_dv_level >= 50 & yplgrs_dv_level <= 83333 & hours >= 1 & hours <= 100) sum wage_hour, det +fre wage_hour if wage_hour==0 +fre wage_hour if wage_hour==. *replace wage_hour = . if wage_hour < 4 | wage_hour > 70 + * relationship status (1=cohabitating) gen mar = (dcpst==1) + * children gen any02 = dnc02 > 0 gen dnc4p = dnc @@ -344,17 +317,14 @@ replace dnc4p = 1 if (dnc>4) gen dnc2p = dnc replace dnc2p = 2 if (dnc>2) cap gen child = (dnc>0) -* individual weights -by idperson: egen wgt = mean(dimlwt) -* - -/**************************************************************/ -* -* preliminaries -* -/**************************************************************/ +* individual weights +//by idperson: egen wgt = mean(dimlwt) +by idperson: egen wgt = mean(dimxwt) + +* ln wages gen lwage_hour = ln(wage_hour) + hist lwage_hour if lwage_hour > 0 & lwage_hour < 4.4 gen swage_hour = asinh(wage_hour) @@ -362,20 +332,77 @@ hist swage_hour if (swage_hour > 1 & swage_hour < 5) replace lwage_hour = . if (wage_hour<5 | wage_hour>1000) +gen lwage_hour_2 = lwage_hour^2 + +*correct employment status replace les_c3 = 3 if lwage_hour == . & les_c3 ! = 2 // PB: employment status is set on the basis of hourly wage not missing, so recode labour market activity status to match this for non-students replace les_c3 = 1 if lwage_hour != . // PB: as above, if wage present consider as employed recode deh_c3 dehm_c3 dehf_c3 drgn1 dhe (-9=.) 
+gen L1les_c3 = L1.les_c3 + +*part time work +gen pt = (hours > 0) * (hours <= 25) +drop hrs0_m1 hrs1_m1 + + + +***************************************************************************************************************************** +* Set Excel file +* Info sheet - first stage +putexcel set "$dir_results/reg_employmentSelection", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "This file contains regression estimates from the first stage of the Heckman selection model used to estimate wages." +putexcel A2 = "Authors: Patryk Bronka, Justin Van de Ven, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "EmploymentSelection_FemaleNE" +putexcel B5 = "First stage Heckman selection estimates for women that do not have an observed wage in the previous year" +putexcel A6 = "EmploymentSelection_MaleNE" +putexcel B6 = "First stage Heckman selection estimates for men that do not have an observed wage in the previous year" +putexcel A7 = "EmploymentSelection_FemaleE" +putexcel B7 = "First stage Heckman selection estimates for women that have an observed wage in the previous year" +putexcel A8 = "EmploymentSelection_MaleE" +putexcel B8 = "First stage Heckman selection estimates for men that have an observed wage in the previous year" + +putexcel A11 = "Notes:", bold +putexcel B11 = "Estimated on panel data unlike the labour supply estimates" +putexcel B12 = "Predicted wages used as input into union parameters and income process estimates" +putexcel B13 = "Two-step Heckman command is used which does not permit weights" + +* Info sheet - second stage +putexcel set "$dir_results/reg_wages", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "This file contains regression estimates used to calculate potential wages for males and females in the simulation." 
+putexcel A2 = "Authors: Patryk Bronka, Daria Popova" +putexcel A3 = "Last edit: 1 July 2025 DP" + +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "Wages_FemalesNE" +putexcel B5 = "Heckman selection estimates using women that do not have an observed wage in the previous year" +putexcel A6 = "Wages_MalesNE" +putexcel B6 = "Heckman selection estimates using men that do not have an observed wage in the previous year" +putexcel A7 = "Wages_FemalesE" +putexcel B7 = "Heckman selection estimates using women that have an observed wage in the previous year" +putexcel A8 = "Wages_MalesE" +putexcel B8 = "Heckman selection estimates using men that have an observed wage in the previous year" + +putexcel A11 = "Notes:", bold +putexcel B11 = "Estimated on panel data unlike the labour supply estimates" +putexcel B12 = "Predicted wages used as input into union parameters and income process estimates" +putexcel B13 = "Two-step Heckman command is used which does not permit weights" +putexcel B14 = "Regions: London is the reference region" + /**************************************************************/ * -* pooled cross-sectional regressions +* Regressions * /**************************************************************/ -gen pt = (hours > 0) * (hours <= 25) -drop hrs0_m1 hrs1_m1 - * Strategy: * 1) Heckman estimated on the sub-sample of individuals who were not observed working in previous period. * Wage equation does not controls for lagged wage @@ -384,103 +411,880 @@ drop hrs0_m1 hrs1_m1 * Specification of selection equation is the same in the two samples * Flag to identify observations to be included in the estimation sample +/* The sample should include only individuals who are observed for at least two periods, and then the first observation should not be used in the estimation. 
*/ bys idperson: gen obs_count = _N -gen in_sample = (obs_count > 1 & swv > 1) +gen in_sample = (obs_count > 1 & swv > 1) * Flag to distinguish the two samples capture drop previouslyWorking -gen previouslyWorking = (L1.lwage_hour != .) /* PB 07.02.2023: I think this will set previosuly working to 0 for everyone -who is not observed in the previous period, e.g. all observations at Wave 1. I think the sample should include only individuals -who are observed for at least two periods, and then the first observation should not be used in the estimation. */ +gen previouslyWorking = (L1.lwage_hour != .) +fre previouslyWorking +* Prep storage capture drop lwage_hour_hat wage_hour_hat esample gen lwage_hour_hat = . gen wage_hour_hat = . gen esample = . - -gen L1les_c3 = L1.les_c3 -gen lwage_hour_2 = lwage_hour^2 - gen pred_hourly_wage = . *** 1) Heckman estimated on the sub-sample of individuals who were not observed working in previous period. **** Wage equation does not control for lagged wage - +************************************************************************************************************************** * women -global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1 pt real_wage_growth" -global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1 " +************************************************************************************************************************** +global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe +global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe local filter = "dgn==0 & dag>=$min_age & dag<=$max_age & !previouslyWorking" *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust) -heckman $wage_eqn if `filter', 
select($seln_eqn) twostep +heckman $wage_eqn if `filter', select($seln_eqn) twostep outputResults "Not-working women3" -outreg2 stats(coef se pval) using "$dir_data/Output_NWW.doc", replace /// +outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_NWW.doc", replace /// title("Heckman-corrected wage equation estimated on the sample of women who were not in employment last year") /// - ctitle(In education) label side dec(2) noparen - + ctitle(Not working women) label side dec(2) noparen + + *xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2) computePredicted "heckman" `filter' -analyseFit "e(sample)" "nocorr" -replace esample = 1 if e(sample) -replace pred_hourly_wage = wage_hour_hat if e(sample) +analyseFit "e(sample)" "nocorr" "Not working women, 17-64 years" "NWW" +gen in_sample_fnpw = e(sample) +replace pred_hourly_wage = wage_hour_hat if in_sample_fnpw +* Save sample for later use (internal validation) +save "$dir_validation_data/Female_NPW_sample", replace +* Formatted results +* Clean up matrix of estimates +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) + +* Second stage +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") replace +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +* Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only 
non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") modify +putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) + +preserve + +import excel "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") firstrow /// + clear +ds + +drop if C == 0 // UPDATE +drop A +drop AH-BM // UPDATE + + + +mkmat *, matrix(Females_NLW) +putexcel set "$dir_results/reg_wages", /// + sheet("UK_Wages_FemalesNE") modify +putexcel B2 = matrix(Females_NLW) + +restore + +* Labelling +putexcel set "$dir_results/reg_wages", /// + sheet("UK_Wages_FemalesNE") modify + +local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + + +* First stage +preserve + +import excel "$dir_raw_results/wages/reg_wages", sheet("Females_NLW") firstrow /// + clear +ds + +drop if AN == 0 // UPDATE +drop A +drop C-AG // UPDATE +drop BN // UPDATE + + +mkmat *, matrix(Females_NLW) +putexcel set "$dir_results/reg_employmentSelection", /// + 
sheet("UK_EmploymentSelection_FemaleNE") modify +putexcel B2 = matrix(Females_NLW) + +restore + +* Labelling +putexcel set "$dir_results/reg_employmentSelection", /// + sheet("UK_EmploymentSelection_FemaleNE") modify + +local var_list Les_c3_NotEmployed_L1 Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +cap drop lambda + + +* Calculate RMSE +cap drop residuals squared_residuals +gen residuals = lwage_hour - lwage_hour_hat +gen squared_residuals = residuals^2 + +preserve +keep if `filter' +sum squared_residuals +di "RMSE for Not employed women: " sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A1=("REGRESSOR") B1=("COEFFICIENT") /// +A2=("Wages_FemalesNE") B2=(sqrt(r(mean))) +restore + + +**************************************************************************************************************************** * men -global wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1 pt real_wage_growth" -global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1 " +**************************************************************************************************************************** +global 
wage_eqn "lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe +global seln_eqn "i.L1les_c3 dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe local filter = "dgn==1 & dag>=$min_age & dag<=$max_age & !previouslyWorking" *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust) -heckman $wage_eqn if `filter', select($seln_eqn) twostep +heckman $wage_eqn if `filter', select($seln_eqn) twostep outputResults "Not-working men3" -outreg2 stats(coef se pval) using "$dir_data/Output_NWM.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of men who were not in employment in the previous year") /// -ctitle(Wage equation coef.) label side dec(2) noparen - +outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_NWM.doc", replace /// +title("Heckman-corrected wage equation estimated on the sample of men who were not in employment last year") /// + ctitle(Not working men) label side dec(2) noparen + + +*xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2) computePredicted "heckman" `filter' -analyseFit "e(sample)" "nocorr" -replace esample = 1 if e(sample) -replace pred_hourly_wage = wage_hour_hat if e(sample) +analyseFit "e(sample)" "nocorr" "Not working men, 17-64 years" "NWM" +gen in_sample_mnpw = e(sample) +replace pred_hourly_wage = wage_hour_hat if in_sample_mnpw + +* Save sample for later use (internal validation) +save "$dir_validation_data/Male_NPW_sample", replace + +* Formatted results +* Clean up matrix of estimates +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + 
+forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) + +* Second stage +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") replace +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +* Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") modify +putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) + +preserve + +import excel "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") firstrow /// + clear +ds + +drop if C == 0 // UPDATE +drop A +drop AH-BM // UPDATE + + + +mkmat *, matrix(Males_NLW) +putexcel set "$dir_results/reg_wages", /// + sheet("UK_Wages_MalesNE") modify +putexcel B2 = matrix(Males_NLW) + +restore + +* Labelling +putexcel set "$dir_results/reg_wages", /// + sheet("UK_Wages_MalesNE") modify + +local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + 
`i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + + +* First stage +preserve + +import excel "$dir_raw_results/wages/reg_wages", sheet("Males_NLW") firstrow /// + clear +ds + +drop if AN == 0 // UPDATE +drop A +drop C-AG // UPDATE +drop BN // UPDATE + + +mkmat *, matrix(Males_NLW) +putexcel set "$dir_results/reg_employmentSelection", /// + sheet("UK_EmploymentSelection_MaleNE") modify +putexcel B2 = matrix(Males_NLW) + +restore + +* Labelling +putexcel set "$dir_results/reg_employmentSelection", /// + sheet("UK_EmploymentSelection_MaleNE") modify + +local var_list Les_c3_NotEmployed_L1 Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +cap drop lambda + +* Calculate RMSE +cap drop residuals squared_residuals +gen residuals = lwage_hour - lwage_hour_hat +gen squared_residuals = residuals^2 + +preserve +keep if `filter' +sum squared_residuals +di "RMSE for Not employed men: " sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A1=("REGRESSOR") 
B1=("COEFFICIENT") /// +A3=("Wages_MalesNE") B3=(sqrt(r(mean))) +restore + *** 2) Heckman estimated on the sub-sample of individuals who were observed working in previous period. *** Wage equation controls for lagged wage - +*************************************************************************************************************************************** * women -global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1 pt real_wage_growth" -global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1 " -local filter = "dgn==0 & dag>=$min_age & dag<=$max_age & swv > 1 & previouslyWorking" +*************************************************************************************************************************************** +global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe +global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe +local filter = "dgn==0 & dag>=$min_age & dag<=$max_age & previouslyWorking" *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust) -heckman $wage_eqn if `filter', select($seln_eqn) twostep +heckman $wage_eqn if `filter', select($seln_eqn) twostep outputResults "Working women3" -outreg2 stats(coef se pval) using "$dir_data/Output_WW.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of women who were in employment in the previous year") /// - ctitle(Wage equation coef.) 
label side dec(2) noparen - +outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_WW.doc", replace /// +title("Heckman-corrected wage equation estimated on the sample of women who were in employment last year") /// + ctitle(Working women) label side dec(2) noparen + + +*xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2) computePredicted "heckman" `filter' -analyseFit "e(sample)" -replace esample = 1 if e(sample) -replace pred_hourly_wage = wage_hour_hat if e(sample) +analyseFit "e(sample)" "nocorr" "Working women, 17-64 years" "WW" +gen in_sample_fpw = e(sample) +replace pred_hourly_wage = wage_hour_hat if in_sample_fpw + +* Save sample for later use (internal validation) +save "$dir_validation_data/Female_PW_sample", replace + +* Formatted results +* Clean up matrix of estimates +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) + +* Store variance-covariance matrix +preserve +putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear + +describe +local no_vars = `r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) + +* Second stage +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_LW") replace +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +* Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) 
+
+* Populate nonzero_b with non-zero coefficients from b 
+* Walks b left to right and copies every entry that is not exactly 0 into the
+* next free slot of nonzero_b, so the original ordering is preserved.
+* NOTE(review): exact zeros are presumably omitted/base factor levels - confirm.
+local index = 1 
+forvalues i = 1/`no_vars' { 
+	if (b[1, `i'] != 0) { 
+		matrix nonzero_b[1, `index'] = b[1, `i'] 
+		local index = `index' + 1 
+	} 
+} 
+
+* Write the compacted coefficients as a column (transposed), with matrix names,
+* starting at A1 of the raw "Females_LW" sheet.
+putexcel set "$dir_raw_results/wages/reg_wages", sheet("Females_LW") modify 
+putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) 
+
+* Keep the estimation data safe while the raw sheet is read back in;
+* the matching -restore- follows directly after this block.
+preserve 
+
+* Re-import the raw sheet, taking variable names from its first row
+import excel "$dir_raw_results/wages/reg_wages", sheet("Females_LW") firstrow /// 
+	clear 
+ds 
+
+* The column letters below are hard-coded to the current specification and
+* must be revised whenever regressors are added or removed (hence "UPDATE").
+* NOTE(review): presumably this keeps only the wage-equation rows/columns and
+* drops the selection-equation part (AI-BM) - verify against the -ds- output.
+drop if C == 0 // UPDATE 
+drop A 
+drop AI-BM // UPDATE 
+
+
+* Turn the remaining columns into a matrix and write it from B2 of the
+* formatted results sheet for this process.
+mkmat *, matrix(Females_LW) 
+putexcel set "$dir_results/reg_wages", /// 
+	sheet("UK_Wages_FemalesE") modify 
+putexcel B2 = matrix(Females_LW) 
+ +restore + +* Labelling +putexcel set "$dir_results/reg_employmentSelection", /// + sheet("UK_EmploymentSelection_FemaleE") modify + +local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +cap drop lambda + + +* Calculate RMSE +cap drop residuals squared_residuals +gen residuals = lwage_hour - lwage_hour_hat +gen squared_residuals = residuals^2 + +preserve +keep if `filter' +sum squared_residuals +di "RMSE for Employed women: " sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A1=("REGRESSOR") B1=("COEFFICIENT") /// +A4=("Wages_FemalesE") B4=(sqrt(r(mean))) +restore + + +**************************************************************************************************************************************** * men -global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd i.dhe ib8.drgn1 pt real_wage_growth" -global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd i.dhe ib8.drgn1" -local filter = "dgn==1 & dag>=$min_age & dag<=$max_age & swv > 1 & previouslyWorking" +**************************************************************************************************************************************** 
+global wage_eqn "lwage_hour L1.lwage_hour dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 pt real_wage_growth y2020 y2021 i.dot" //i.dhe +global seln_eqn "dag dagsq i.deh_c3 i.deh_c3#c.dag ded i.dehmf_c3 mar child dlltsd01 dhe_pcs dhe_mcs ib8.drgn1 y2020 y2021 i.dot" //i.dhe +local filter = "dgn==1 & dag>=$min_age & dag<=$max_age & previouslyWorking" *heckman $wage_eqn if `filter' [pweight=dimxwt], select($seln_eqn) vce(robust) -heckman $wage_eqn if `filter', select($seln_eqn) twostep +heckman $wage_eqn if `filter', select($seln_eqn) twostep outputResults "Working men3" -outreg2 stats(coef se pval) using "$dir_data/Output_WM.doc", replace /// -title("Heckman-corrected wage equation estimated on the sample of men who were in employment in the previous year") /// - ctitle(Wage equation coef.) label side dec(2) noparen +outreg2 stats(coef se pval) using "$dir_raw_results/wages/Output_WM.doc", replace /// +title("Heckman-corrected wage equation estimated on the sample of men who were in employment last year") /// + ctitle(Working women) label side dec(2) noparen + + +*xtheckmanfe $wage_eqn if `filter', select($seln_eqn) reps(2) +computePredicted "heckman" `filter' +analyseFit "e(sample)" "nocorr" "Working men, 17-64 years" "WM" +gen in_sample_mpw = e(sample) +replace pred_hourly_wage = wage_hour_hat if in_sample_mpw + +* Save sample for later use (internal validation) +save "$dir_validation_data/Male_PW_sample", replace +* Formatted results +* Clean up matrix of estimates +* Note: Zeros values are eliminated +matrix b = e(b) +matrix V = e(V) -computePredicted "heckman" `filter' -analyseFit "e(sample)" -replace esample = 1 if e(sample) -replace pred_hourly_wage = wage_hour_hat if e(sample) +* Store variance-covariance matrix +preserve + +putexcel set "$dir_raw_results/wages/var_cov", sheet("var_cov") replace +putexcel A1 = matrix(V) + +import excel "$dir_raw_results/wages/var_cov", sheet("var_cov") clear + +describe +local no_vars = 
`r(k)' + +forvalues i = 1/2 { + egen row_sum = rowtotal(*) + drop if row_sum == 0 + drop row_sum + xpose, clear +} + +mkmat v*, matrix(var) + +* Second stage +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_LW") replace +putexcel C2 = matrix(var) + +restore + +* Store estimated coefficients +* Initialize a counter for non-zero coefficients +local non_zero_count = 0 +//local names : colnames b + +* Loop through each element in `b` to count non-zero coefficients +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + local non_zero_count = `non_zero_count' + 1 + } +} + +* Create a new row vector to hold only non-zero coefficients +matrix nonzero_b = J(1, `non_zero_count', .) + +* Populate nonzero_b with non-zero coefficients from b +local index = 1 +forvalues i = 1/`no_vars' { + if (b[1, `i'] != 0) { + matrix nonzero_b[1, `index'] = b[1, `i'] + local index = `index' + 1 + } +} + +putexcel set "$dir_raw_results/wages/reg_wages", sheet("Males_LW") modify +putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) + +preserve + +import excel "$dir_raw_results/wages/reg_wages", sheet("Males_LW") firstrow /// + clear +ds + +drop if C == 0 // UPDATE +drop A +drop AI-BM // UPDATE + + +mkmat *, matrix(Males_LW) +putexcel set "$dir_results/reg_wages", /// + sheet("UK_Wages_MalesE") modify +putexcel B2 = matrix(Males_LW) + +restore +* Labelling +putexcel set "$dir_results/reg_wages", /// + sheet("UK_Wages_MalesE") modify + +local var_list L1_log_hourly_wage Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dlltsd01 dhe_pcs dhe_mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Pt RealWageGrowth Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant InverseMillsRatio + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local 
letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + + +* First stage +preserve + +import excel "$dir_raw_results/wages/reg_wages", sheet("Males_LW") firstrow /// + clear +ds + +drop if AO == 0 // UPDATE +drop A +drop C-AH // UPDATE +drop BN // UPDATE + + +mkmat *, matrix(Males_LW) +putexcel set "$dir_results/reg_employmentSelection", /// + sheet("UK_EmploymentSelection_MaleE") modify +putexcel B2 = matrix(Males_LW) + +restore + +* Labelling +putexcel set "$dir_results/reg_employmentSelection", /// + sheet("UK_EmploymentSelection_MaleE") modify + +local var_list Dag Dag_sq Deh_c3_Medium Deh_c3_Low Deh_c3_Medium_Dag /// + Deh_c3_Low_Dag Ded Dehmf_c3_Medium Dehmf_c3_Low Dcpst_Partnered D_Children Dlltsd01 Dhe_Pcs Dhe_Mcs /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN Y2020 Y2021 /// + Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} + +cap drop lambda + + +* Calculate RMSE +cap drop residuals squared_residuals +gen residuals = lwage_hour - lwage_hour_hat +gen squared_residuals = residuals^2 + +preserve +keep if `filter' +sum squared_residuals +di "RMSE for Employed men: " sqrt(r(mean)) +putexcel set "$dir_results/reg_RMSE.xlsx", sheet("UK") modify +putexcel A1=("REGRESSOR") 
B1=("COEFFICIENT") /// +A5=("Wages_MalesE") B5=(sqrt(r(mean))) +restore + + +sum wage_hour if wage_hour >0& stm==19, d +sum pred_hourly_wage if pred_hourly_wage >0& stm==19, d + +/* +****************************************************************************************************************************************** * all analyseFit "esample == 1" analyseFit "esample == 1 & dgn == 0" // women @@ -496,15 +1300,14 @@ forvalues year = 11/23 { analyseFit2 "esample == 1 & dgn == 1 & deh_c3 == 1 & stm == `year'" "nocorr" "Year 20`year' men prv emp high ed" "men_highed_`year'_graph.png" // men } - +*/ // Note: sigma reported in the estimated regressions is the standard deviation of the residuals (=RMSE, assuming residuals are normally distributed) *** Save for use in the do file estimating non-employment income replace pred_hourly_wage = exp(lwage_hour) if missing(pred_hourly_wage) -save "$dir_ukhls_data/ukhls_pooled_all_obs.dta", replace - +save "$dir_ukhls_data/ukhls_pooled_all_obs_10.dta", replace *** Calculate the proportion of "true zero" hours of work among those in the "ZERO" weekly hours of labour supply bracket. *I.e. the share of zero hours among 0-5 hours for those at risk of work. diff --git a/input/InitialPopulations/compile/RegressionEstimates/variable_update.do b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do new file mode 100644 index 000000000..bfaa412f1 --- /dev/null +++ b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do @@ -0,0 +1,321 @@ + +xtset idperson swv + +* -------------------------------------------- +* 1. Handle Missing Values and Basic Setup +* -------------------------------------------- + +// Recode -9 as missing for all variables +foreach var of varlist _all { + replace `var' = . 
if `var' == -9 +} + +// Sort data by individual and wave +sort idperson swv + +// Recode year to two-digit format +replace stm = stm - 2000 + +// cap generate COVID year dummies +cap cap gen y2020 = (stm == 20) +cap cap gen y2021 = (stm == 21) + + +* -------------------------------------------- +* 2. Correct Inconsistencies +* -------------------------------------------- + +// Fix inconsistent student coding +replace ded = 0 if idperson == idperson[_n-1] & ded == 1 & ded[_n-1] == 0 + + +* -------------------------------------------- +* 3. Construct New Variables +* -------------------------------------------- + +// Partnership status in the first year +cap cap gen new_rel = 0 if dcpst == 1 +replace new_rel = 1 if dcpen == 1 +label var new_rel "Partnerhip in first year" + +// Household type: 8 categories +cap cap gen dhhtp_c8 = . +label var dhhtp_c8 "Household Type: 8 Category" +replace dhhtp_c8 = 1 if dhhtp_c4 == 1 & lessp_c3 == 1 +replace dhhtp_c8 = 2 if dhhtp_c4 == 1 & lessp_c3 == 2 +replace dhhtp_c8 = 3 if dhhtp_c4 == 1 & lessp_c3 == 3 +replace dhhtp_c8 = 4 if dhhtp_c4 == 2 & lessp_c3 == 1 +replace dhhtp_c8 = 5 if dhhtp_c4 == 2 & lessp_c3 == 2 +replace dhhtp_c8 = 6 if dhhtp_c4 == 2 & lessp_c3 == 3 +replace dhhtp_c8 = 7 if dhhtp_c4 == 3 +replace dhhtp_c8 = 8 if dhhtp_c4 == 4 +cap label define dhhtp_c8 1 "Couple with no children, spouse employed" 2 "Couple with no children, spouse student" 3 "Couple with no children, spouse not employed" 4 "Couple with children, spouse employed" 5 "Couple with children, spouse student" 6 "Couple with children, spouse not employed" 7 "Single with no children" 8 "Single with children" +label values dhhtp_c8 dhhtp_c8 + +// Squared income variable +cap cap gen ypnbihs_dv_sq = ypnbihs_dv^2 +label variable ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared" + +// Dummy for receiving capital income +cap cap gen receives_ypncp = (ypncp > 0 & !missing(ypncp)) + +// Transform capital income from IHS to level + log +cap drop ypncp_lvl 
+cap gen ypncp_lvl = sinh(ypncp)
+// ln() of a non-positive level is missing, so ln_ypncp is only defined for positive capital income
+cap gen ln_ypncp = ln(ypncp_lvl)
+
+// Dummy and transformation for private pension income
+cap drop ypnoab_lvl
+cap gen ypnoab_lvl = sinh(ypnoab)
+cap gen ln_ypnoab = ln(ypnoab_lvl)
+cap gen receives_ypnoab = (ypnoab_lvl > 0 & !missing(ypnoab_lvl))
+
+// Dummy for state pension age
+// NOTE(review): a missing dag compares as greater than 68 and would be coded 1 - confirm dag is never missing here.
+cap gen state_pension_age = (dag >= 68)
+
+
+* --------------------------------------------------
+* 4. Lag Variables + Handle Missing Lags at Age 16
+* --------------------------------------------------
+
+// Create basic lags: previous row's value, only when that row is the same person
+// and exactly one wave (swv) earlier; otherwise the lag stays missing.
+sort idperson swv
+foreach lagvar in ydses_c5 dhe les_c3 lesnr_c2 dhhtp_c4 dhe_pcs dhe_mcs dlltsd dlltsd01 {
+    cap gen l_`lagvar' = `lagvar'[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1
+}
+
+// Fill in missing lags using current values at age 16.
+// Rows are put in descending year order within person before carryforward runs, so values
+// are copied from later years into earlier missing rows with dag <= 16.
+// NOTE(review): carryforward is a user-written command (not shown in this file) - confirm it is installed.
+gsort +idperson -stm
+foreach lagvar in dhe dhe_pcs dhe_mcs {
+    bys idperson: carryforward `lagvar' if dag <= 16, replace
+}
+
+// One-period lags of the health measures; where no lag is available the current value
+// of the SAME variable is used (previously dhe_mcs_L1 was wrongly back-filled from dhe).
+// NOTE(review): the l. operator needs the panel to be declared (xtset/tsset) earlier in the file - not visible here.
+sort idperson swv
+foreach lagvar in dhe dhe_pcs dhe_mcs {
+    cap drop `lagvar'_L1
+    bys idperson: gen `lagvar'_L1 = l.`lagvar'
+    replace `lagvar'_L1 = `lagvar' if missing(`lagvar'_L1)
+}
+
+// First and second lags of the income, household-type and employment variables.
+// Each lag is rebuilt from scratch (cap drop + gen); where the lag is missing the
+// current value of the same variable is substituted.
+// NOTE(review): the l./l2. operators need the panel to be declared (xtset/tsset) earlier in the file - not visible here.
+cap drop yplgrs_dv_L1
+bys idperson: gen yplgrs_dv_L1 = l.yplgrs_dv
+replace yplgrs_dv_L1 = yplgrs_dv if missing(yplgrs_dv_L1)
+
+cap drop yplgrs_dv_L2
+bys idperson: gen yplgrs_dv_L2 = l2.yplgrs_dv
+replace yplgrs_dv_L2 = yplgrs_dv if missing(yplgrs_dv_L2)
+
+cap drop ypncp_L1
+bys idperson: gen ypncp_L1 = l.ypncp
+replace ypncp_L1 = ypncp if missing(ypncp_L1)
+
+cap drop ypncp_L2
+bys idperson: gen ypncp_L2 = l2.ypncp
+replace ypncp_L2 = ypncp if missing(ypncp_L2)
+
+cap drop ypnoab_L1
+bys idperson: gen ypnoab_L1 = l.ypnoab
+replace ypnoab_L1 = ypnoab if missing(ypnoab_L1)
+
+cap drop ypnoab_L2
+bys idperson: gen ypnoab_L2 = l2.ypnoab
+replace ypnoab_L2 = ypnoab if missing(ypnoab_L2)
+
+cap drop dhhtp_c4_L1
+bys idperson: gen dhhtp_c4_L1 = l.dhhtp_c4
+replace dhhtp_c4_L1 = dhhtp_c4 if missing(dhhtp_c4_L1)
+
+cap drop les_c3_L1
+bys idperson: gen les_c3_L1 = l.les_c3
+replace les_c3_L1 = les_c3 if missing(les_c3_L1)
+
+
+* --------------------------------------------------
+* 5. Labelling
+* --------------------------------------------------
+
+* Label definitions
+* ("cap" keeps an already defined label set as it is; only dhe is updated in place via modify)
+cap label define jbf 1 "Employed" 2 "Student" 3 "Not Employed"
+cap label define jbg 1 "Employed" 2 "Student" 3 "Not employed" 4 "Retired"
+cap label define edd 1 "Degree" 2 "Other Higher/A-level/GCSE" 3 "Other/No Qualification"
+cap label define hht 1 "Couples with No Children" 2 "Couples with Children" 3 "Single with No Children" 4 "Single with Children"
+cap label define gdr 1 "Male" 0 "Female"
+cap label define rgna 1 "North East" 2 "North West" 4 "Yorkshire and the Humber" 5 "East Midlands" 6 "West Midlands" 7 "East of England" 8 "London" 9 "South East" 10 "South West" 11 "Wales" 12 "Scotland" 13 "Northern Ireland"
+cap label define yn 1 "Yes" 0 "No"
+cap label define dces 1 "Both Employed" 2 "Employed, Spouse Not Employed" 3 "Not Employed, Spouse Employed" 4 "Both Not Employed"
+cap label define ethn 1 "White" 2 "Asian or Asian British" 3 "Black, Black British, Caribbean, or African" 4 "Other or missing ethnic group"
+cap label define dhe 1 "Poor" 2 "Fair" 3 "Good" 4 "Very Good" 5 "Excellent", modify
+
+* Variable labels
+label variable dgn "Gender"
+label variable dag "Age"
+label variable dagsq "Age Squared"
+label variable drgn1 "Region"
+label variable stm "Year"
+label variable les_c3 "Employment Status: 3 Category"
+label variable les_c4 "Employment Status: 4 Category"
+label variable dhe "Self-rated Health"
+label variable dcpen "Entered a new Partnership"
+label variable dcpex "Partnership dissolution"
+label variable deh_c3 "Educational Attainment: 3 Category"
+label variable ydses_c5 "Annual Household Income Quintile"
+label variable dlltsd "Long-term Sick or Disabled"
+label variable dhhtp_c4 "Household Type: 4 Category"
+label variable dhhtp_c8 "Household Type: 8 Category"
+label variable dnc "Number of Children in Household"
+label variable dnc02 "Number of Children aged 0-2 in Household"
+label variable dot "Ethnicity"
+label variable dehmf_c3 "Highest Parental Educational Attainment: 3 Category"
+label variable dhe_mcs "Subjective Self-rated health - Mental (SF12 MCS)"
+label variable dhe_pcs "Subjective Self-rated health - Physical (SF12 PCS)"
+label variable dagpns "Reached state retirement age"
+label variable dagpns_sp "Reached state retirement age - partner"
+label variable dukfr "UK Fertility Rate"
+label variable lesdf_c4 "Differential Employment Status"
+label variable ypnbihs_dv "Personal Non-benefit Gross Income"
+label variable ynbcpdf_dv "Differential Personal Non-Benefit Gross Income"
+
+* Attach value labels to variables
+* (the last name on each line is the label set; the names before it are the variables)
+label values dgn gdr
+label values drgn1 rgna
+label values les_c3 lessp_c3 jbf
+label values les_c4 jbg
+label values deh_c3 dehsp_c3 edd
+label values dcpen dcpex yn
+label values lesdf_c4 dces
+label values dhhtp_c4 hht
+label values dhhtp_c8 dhhtp_c8
+label values dot ethn
+label values dhe dhe
+label value ded yn
+label value dlltsd yn
+label value dlltsd01 yn
+
+* Alter names and create dummies for
automatic labelling
+*(required for gologit)
+
+* Capitalised copies of the core covariates (kept as they are if they already exist)
+capture generate Dgn = dgn
+capture generate Dag = dag
+capture generate Dag_sq = dagsq
+
+
+* Remove dummies left over from an earlier run; one capture per pattern so a
+* pattern that matches nothing does not stop the others from being dropped
+foreach stalePattern in UK* Deh_c3_* Dehmf_c3_* Les_c4_* L_Les_c3_* Ydses_c5_Q* L_Ydses_c5_Q* Dhe_* L_Dhe_c5_* Dhhtp_c4_* L_Dhhtp_c4_* dot_* {
+    capture drop `stalePattern'
+}
+capture drop Ethn_White Ethn_Asian Ethn_Black Ethn_Other
+
+* Region dummies, renamed from position to letter suffix:
+* C North East, D North West, E Yorkshire and the Humber, F East Midlands,
+* G West Midlands, H East of England, I London, J South East, K South West,
+* L Wales, M Scotland, N Northern Ireland
+tabulate drgn1, generate(UK)
+local dumIdx = 0
+foreach regionLetter in C D E F G H I J K L M N {
+    local ++dumIdx
+    rename UK`dumIdx' UK`regionLetter'
+}
+
+* Own and parental education dummies share the High/Medium/Low suffixes
+tabulate deh_c3, generate(Deh_c3_)
+tabulate dehmf_c3, generate(Dehmf_c3_)
+local dumIdx = 0
+foreach eduLevel in High Medium Low {
+    local ++dumIdx
+    rename Deh_c3_`dumIdx' Deh_c3_`eduLevel'
+    rename Dehmf_c3_`dumIdx' Dehmf_c3_`eduLevel'
+}
+
+* Current (4 category) and lagged (3 category) employment status dummies
+tabulate les_c4, generate(Les_c4_)
+tabulate l_les_c3, generate(L_Les_c3_)
+local dumIdx = 0
+foreach empState in Employed Student NotEmployed {
+    local ++dumIdx
+    rename Les_c4_`dumIdx' Les_c4_`empState'
+    rename L_Les_c3_`dumIdx' L_Les_c3_`empState'
+}
+rename Les_c4_4 Les_c4_Retired
+
+* Income quintile dummies keep their numeric suffix
+tabulate ydses_c5, generate(Ydses_c5_Q)
+
+tabulate l_ydses_c5, generate(L_Ydses_c5_Q)
+
+* Self-rated health dummies; the lagged set keeps its numeric suffix
+tabulate dhe, generate(Dhe_)
+local dumIdx = 0
+foreach healthGrade in Poor Fair Good VeryGood Excellent {
+    local ++dumIdx
+    rename Dhe_`dumIdx' Dhe_`healthGrade'
+}
+
+tabulate l_dhe, generate(L_Dhe_c5_)
+
+* Current and lagged household type dummies
+tabulate dhhtp_c4, generate(Dhhtp_c4_)
+tabulate l_dhhtp_c4, generate(L_Dhhtp_c4_)
+local dumIdx = 0
+foreach hhType in CoupleNoChildren CoupleChildren SingleNoChildren SingleChildren {
+    local ++dumIdx
+    rename Dhhtp_c4_`dumIdx' Dhhtp_c4_`hhType'
+    rename L_Dhhtp_c4_`dumIdx' L_Dhhtp_c4_`hhType'
+}
+
+* Ethnicity dummies
+tabulate dot, generate(dot_)
+local dumIdx = 0
+foreach ethnGroup in White Asian Black Other {
+    local ++dumIdx
+    rename dot_`dumIdx' Ethn_`ethnGroup'
+}
+
+
+
+
+* Capitalised copies of year, health, income and disability variables and their lags
+capture generate Year_transformed = stm
+
+capture generate Y2020 = y2020
+capture generate Y2021 = y2021
+
+capture generate Dhe = dhe
+capture generate Dhe_pcs = dhe_pcs
+capture generate Dhe_mcs = dhe_mcs
+
+capture generate Ydses_c5 = ydses_c5
+
+capture generate L_Ydses_c5 = l_ydses_c5
+
+capture generate L_Dhe = l_dhe
+capture generate L_Dhe_pcs = l_dhe_pcs
+capture generate L_Dhe_mcs = l_dhe_mcs
+
+capture generate Dlltsd = dlltsd
+capture generate Dlltsd01 = dlltsd01
+
+capture generate L_Dlltsd = l_dlltsd
+capture generate L_Dlltsd01 = l_dlltsd01
+
+
+
diff --git a/input/reg_RMSE.xlsx b/input/reg_RMSE.xlsx
index 8d9d984a4..b9ebaa906 100644
Binary files a/input/reg_RMSE.xlsx and b/input/reg_RMSE.xlsx differ
diff --git a/input/reg_education.xlsx b/input/reg_education.xlsx
index c4c96c2c4..2192acf6f 100644
Binary files a/input/reg_education.xlsx and b/input/reg_education.xlsx differ
diff --git a/input/reg_employmentSelection.xlsx b/input/reg_employmentSelection.xlsx
index e3734b6ac..6f9c0bf55 100644
Binary files a/input/reg_employmentSelection.xlsx and b/input/reg_employmentSelection.xlsx differ
diff --git a/input/reg_fertility.xlsx b/input/reg_fertility.xlsx
index f9dac777c..1fad1a58b 100644
Binary files a/input/reg_fertility.xlsx and b/input/reg_fertility.xlsx differ
diff --git a/input/reg_health.xlsx b/input/reg_health.xlsx
index 74994f26e..115322b6e 100644
Binary files a/input/reg_health.xlsx and b/input/reg_health.xlsx differ
diff --git a/input/reg_home_ownership.xlsx b/input/reg_home_ownership.xlsx
index 4803ba8a7..7ece031c5 100644
Binary files a/input/reg_home_ownership.xlsx and b/input/reg_home_ownership.xlsx differ
diff --git a/input/reg_income.xlsx b/input/reg_income.xlsx
index f10d898aa..b37671688 100644
Binary files a/input/reg_income.xlsx and b/input/reg_income.xlsx differ
diff --git a/input/reg_labourSupplyUtility.xlsx b/input/reg_labourSupplyUtility.xlsx
index
db48d381d..145ead6bb 100644 Binary files a/input/reg_labourSupplyUtility.xlsx and b/input/reg_labourSupplyUtility.xlsx differ diff --git a/input/reg_leave_parental_home.xlsx b/input/reg_leave_parental_home.xlsx new file mode 100644 index 000000000..2333c34cd Binary files /dev/null and b/input/reg_leave_parental_home.xlsx differ diff --git a/input/reg_partnership.xlsx b/input/reg_partnership.xlsx index 9c689b853..6d390b368 100644 Binary files a/input/reg_partnership.xlsx and b/input/reg_partnership.xlsx differ diff --git a/input/reg_retirement.xlsx b/input/reg_retirement.xlsx index 08663ae2b..4c9092d84 100644 Binary files a/input/reg_retirement.xlsx and b/input/reg_retirement.xlsx differ diff --git a/input/reg_wages.xlsx b/input/reg_wages.xlsx index 0fdacc128..1e0f1950b 100644 Binary files a/input/reg_wages.xlsx and b/input/reg_wages.xlsx differ