diff --git a/input/InitialPopulations/compile/00_master.do b/input/InitialPopulations/compile/00_master.do index b7509704d..434ef7fcd 100644 --- a/input/InitialPopulations/compile/00_master.do +++ b/input/InitialPopulations/compile/00_master.do @@ -8,7 +8,7 @@ * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * WAS EUL version - UKDA-7215-stata [to wave 7] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 18 July 2025 DP +* LAST UPDATE: 4 Nov 2025 DP *************************************************************************************** *************************************************************************************** @@ -36,32 +36,35 @@ set matsize 1000 *************************************************************************************/ * Working directory -*global dir_work "C:\MyFiles\99 DEV ENV\JAS-MINE\data work\initial_populations" -*global dir_work "C:\Users\Patryk\Documents\SP_prep_pop" -global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations" +global dir_work "C:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\initial_populations" * Directory which contains do files global dir_do "${dir_work}/do" -*global dir_do "C:\Users\Patryk\git\SimPathsFork\input\InitialPopulations\compile" -* Directory which contains data files -global dir_data "${dir_work}/data" +* Directory which contains processed data +global dir_data "${dir_work}/data" //data * Directory which contains log files global dir_log "${dir_work}/log" * Directory which contains UKHLS data -*global dir_ukhls_data "J:\01 DATA\UK\ukhls\wave14\stata\stata13_se\ukhls" -global dir_ukhls_data "D:\Dasha\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\ukhls" -*global dir_ukhls_data "C:\Users\Patryk\Documents\SP_prep_pop\ukhls\UKDA-6614-stata\stata\stata13_se\ukhls" +global dir_ukhls_data "D:\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\ukhls" //original_data + +* Directory which contains BHPS data +global dir_bhps_data 
"D:\UK-original-data\USoc\UKDA-6614-stata\stata\stata13_se\bhps" //original_data_bhps * Directory which contains WAS data -*global dir_was_data "J:\01 DATA\UK\was\wave7\stata\stata13_se" -global dir_was_data "D:\Dasha\UK-original-data\WAS\UKDA-7215-stata\stata\stata13_se" -*global dir_was_data "C:\Users\Patryk\Documents\WAS\UKDA-7215-stata\stata\stata13_se" +global dir_was_data "D:\UK-original-data\WAS\UKDA-7215-stata\stata\stata13_se" + +//additional paths to employment history files +* Directory which contains processed employment history data +global dir_data_emphist "${dir_data}/emphist" //data_emphist + +* Directory which contains employment history do-files +global dir_do_emphist "${dir_do}/do_emphist" -* Directory which contains original initial popultions -global dir_ipop_orig "${dir_work}/original_initial_populations" +* Directory which contains employment history log files +global dir_log_emphist "${dir_log}/emphist" //log_emphist /************************************************************************************** @@ -88,7 +91,15 @@ wave 12 l 2020-2022 wave 13 m 2021-2023 wave 14 n 2022-2024 */ -global UKHLSwaves "a b c d e f g h i j k l m n" /*all waves*/ + +global UKHLSwaves "a b c d e f g h i j k l m n" /*all waves*/ //ukhls_all_waves +global UKHLSwaves_numbers "1 2 3 4 5 6 7 8 9 10 11 12 13 14" //ukhls_all_waves_numbers + +global UKHLS_panel_waves "b c d e f g h i j k l m n" +global UKHLS_panel_waves_numbers "2 3 4 5 6 7 8 9 10 11 12 13 14" //ukhls_waves_numbers +global UKHLS_waves_prefixed "b_ c_ d_ e_ f_ g_ h_ i_ j_ k_ l_ m_ n_" +global BHPS_waves "l m n o p q r" + * waves reporting social care module in ukhls - ADL questions added from wave 7 and then every other wave (from 2016) global scRecWaves "g i k m" * waves reporting social care provided in ukhls (from 2015) @@ -118,8 +129,10 @@ do "${dir_do}/07_was_wealth_data.do" forvalues year = $wealthStartYear / $wealthEndYear { global yearWealth = `year' do "${dir_do}/08_wealth_to_ukhls.do" -} +} 
+*check data and slice into initial populations do "${dir_do}/09_finalise_input_data.do" +*descriptives for initial populations and full sample do "${dir_do}/10_check_yearly_data.do" diff --git a/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do b/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do index e8f0b2f47..bff821225 100644 --- a/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do +++ b/input/InitialPopulations/compile/01_prepare_UKHLS_pooled_data.do @@ -30,12 +30,12 @@ foreach w of global UKHLSwaves { local waveno=strpos("abcdefghijklmnopqrstuvwxyz","`w'") if (`waveno'<13) { - use pidp `w'_ivfho `w'_ivfio `w'_hhorig `w'_buno_dv `w'_dvage `w'_sex `w'_depchl `w'_hidp `w'_pno `w'_pns1pid `w'_pns2pid `w'_month `w'_intdaty_dv /// + use pidp `w'_ivfho `w'_ivfio `w'_hhorig `w'_memorig `w'_buno_dv `w'_dvage `w'_sex `w'_depchl `w'_hidp `w'_pno `w'_pns1pid `w'_pns2pid `w'_month `w'_intdaty_dv /// `w'_mnspid `w'_fnspid `w'_ppid `w'_ppno `w'_sppid `w'_sex_dv `w'_mastat_dv `w'_gor_dv `w'_age_dv /* `w'_hgbioad1 `w'_hgbioad2 */ /// `w'_intdatd_dv `w'_intdatm_dv `w'_intdaty_dv `w'_ethn_dv using `w'_indall.dta, clear } else { - use pidp `w'_ivfho `w'_ivfio `w'_hhorig `w'_buno_dv `w'_dvage `w'_sex `w'_depchl `w'_hidp `w'_pno `w'_pns1pid `w'_pns2pid `w'_month `w'_intdaty_dv /// + use pidp `w'_ivfho `w'_ivfio `w'_hhorig `w'_memorig `w'_buno_dv `w'_dvage `w'_sex `w'_depchl `w'_hidp `w'_pno `w'_pns1pid `w'_pns2pid `w'_month `w'_intdaty_dv /// `w'_mnspid `w'_fnspid `w'_ppid `w'_ppno `w'_sppid `w'_sex_dv `w'_mastat_dv `w'_gor_dv `w'_age_dv `w'_hgbioad1 `w'_hgbioad2 /// `w'_intdatd_dv `w'_intdatm_dv `w'_intdaty_dv `w'_ethn_dv using `w'_indall.dta, clear } diff --git a/input/InitialPopulations/compile/02_create_UKHLS_variables.do b/input/InitialPopulations/compile/02_create_UKHLS_variables.do index 87a4ae85a..42aac7c80 100644 --- a/input/InitialPopulations/compile/02_create_UKHLS_variables.do +++ 
b/input/InitialPopulations/compile/02_create_UKHLS_variables.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 18 July 2025 DP +* LAST UPDATE: 3 Nov 2025 DP * NOTE: Called from 00_master.do - see master file for further details * Use -9 for missing values *************************************************************************************** @@ -713,28 +713,28 @@ la var der "Return to education" //fre der /*****************************Partnership status*******************************/ -recode mastat_dv (2 3 10 = 1 "Partnered") /// - (0 1 = 2 "Single never married") /// Includes children under 16 - (4 5 6 7 8 9 = 3 "Previously partnered") /// - , into (dcpst) -la var dcpst "Partnership status" -recode dcpst (-8 -2 -1 = -9) - -*If idpartner = 0 (because of household splitting), dcpst should be set to 3 depending on mastat_dv value -replace dcpst = 3 if dcpst == 1 & idpartner <= 0 -replace dcpst = 1 if idpartner > 0 & !missing(idpartner) - -//Children coded as "Never Married" (17 and under chosen as can marry from 18 years onwards) -replace dcpst = 2 if dag <= 17 & idpartner<0 -//fre dcpst +gen dcpst = . +replace dcpst = 1 if idpartner > 0 & !missing(idpartner) //partnered +replace dcpst = 2 if idpartner < 0 | missing(idpartner) +lab var dcpst "Partnership status" +lab def dcpst 1 "partnered" 2 "single" +lab val dcpst dcpst + +recode dcpst (. 
= -9) +/* +* Children coded as "Never Married" +Can only marry from age 18 onwards in the simulation +*/ +replace dcpst = 2 if dag <= 17 & idpartner < 0 +//fre dcpst /*****************************Enter partnership*******************************/ sort idperson swv cap drop dcpen gen dcpen = -9 -replace dcpen=0 if (l.dcpst==2 | l.dcpst==3) -replace dcpen=1 if dcpst==1 & (l.dcpst==2 | l.dcpst==3) +replace dcpen=0 if (l.dcpst==2) +replace dcpen=1 if dcpst==1 & (l.dcpst==2) la val dcpen dummy la var dcpen "Enter partnership" //fre dcpen @@ -745,7 +745,7 @@ sort idperson swv cap drop dcpex gen dcpex=-9 replace dcpex = 0 if l.dcpst==1 -replace dcpex = 1 if dcpst==3 & l.dcpst==1 +replace dcpex = 1 if dcpst==2 & l.dcpst==1 la val dcpex dummy la var dcpex "Exit partnership" //fre dcpex @@ -757,7 +757,7 @@ la var dcpagdf "Partner's age difference" /*********************************Activity status*****************************/ -recode jbstat (1 2 5 12 13 14 = 1 "Employed or self-employed") /// +recode jbstat (1 2 5 12 13 14 15 = 1 "Employed or self-employed") /// (7 = 2 "Student") /// (3 6 8 10 11 97 9 4 = 3 "Not employed") /// /*includes apprenticeships, unpaid family business, govt training scheme+retired */ , into(les_c3) @@ -767,7 +767,7 @@ la var les "Activity status" replace les_c3 = 2 if dag <= 16 //People below age to leave home are not at risk of work so set activity status to not employed if not a student replace les_c3 = 3 if dag < $age_become_responsible & les_c3 != 2 - +//fre les_c3 /***********************Activity status variable adding retirement*************/ *Generate les_c4 variable in addition to the les_c3 variable. Les_c4 adds retired status. 
@@ -777,8 +777,7 @@ replace les_c4 = 4 if jbstat==4 lab var les_c4 "LABOUR MARKET: Activity status" lab define les_c4 1 "Employed or self-employed" 2 "Student" 3 "Not employed" 4 "Retired" lab val les_c4 les_c4 -//tab2 les_c3 les_c4 - +//fre les_c4 /****************************Partner's activity status:***********************/ preserve @@ -870,19 +869,88 @@ bys swv idhh: egen dnc = sum(depChild) *drop depChild la var dnc "Number of dependent children 0 - 18" +/****************************Pension Age***************************************/ +/*cap gen bdt = mdy(1, 15, birthy) /*month of birth is available in special license only*/ +*/ +/*State Retirement Ages for Men in the UK (2009-2023): + +2009-2010: 65 +2010-2011: 65 +2011-2012: 65 +2012-2013: 65 +2013-2014: 65 +2014-2015: 65 +2015-2016: 65 +2016-2017: 65 +2017-2018: 65 +2018-2019: 65 +2019-2020: 65 +2020-2021: 66 +2021-2022: 66 +2022-2023: 66 + +State Retirement Ages for Women in the UK (2009-2023): + +2009-2010: 60 +2010-2011: 60 +2011-2012: 60 +2012-2013: 61 +2013-2014: 61 +2014-2015: 62 +2015-2016: 62 +2016-2017: 63 +2017-2018: 63 +2018-2019: 64 +2019-2020: 65 +2020-2021: 65 +2021-2022: 66 +2022-2023: 66 +*/ +gen dagpns = 0 +//for men +replace dagpns = 1 if dgn==1 & dag>=65 & stm>=2009 & stm<2020 +replace dagpns = 1 if dgn==1 & dag>=66 & stm>=2020 +//for women +replace dagpns = 1 if dgn==0 & dag>=60 & stm>=2009 & stm<2012 +replace dagpns = 1 if dgn==0 & dag>=61 & stm>=2012 & stm<2014 +replace dagpns = 1 if dgn==0 & dag>=62 & stm>=2014 & stm<2016 +replace dagpns = 1 if dgn==0 & dag>=63 & stm>=2016 & stm<2018 +replace dagpns = 1 if dgn==0 & dag>=64 & stm>=2018 & stm<2019 +replace dagpns = 1 if dgn==0 & dag>=65 & stm>=2019 & stm<2021 +replace dagpns = 1 if dgn==0 & dag>=66 & stm>=2021 +//fre dagpns + +/****************************Pension age of a spouse***************************/ +preserve +keep swv idperson idhh dagpns +rename dagpns dagpns_sp +rename idperson idpartner +save "$dir_data/temp_dagpns", replace 
+restore +merge m:1 swv idpartner idhh using "$dir_data/temp_dagpns" +keep if _merge == 1 | _merge == 3 +la var dagpns_sp "Pension age - partner" +drop _merge +replace dagpns_sp=-9 if idpartner<0 + /*******************************Flag for adult children***********************/ +//add parental ages & retirement status preserve keep if dgn == 0 -keep swv idhh idperson dag +keep swv idhh idperson dag dagpns les_c4 rename idperson idmother rename dag dagmother +rename dagpns dagpnsmother +rename les_c4 les_c4mother save "$dir_data/temp_mother_dag", replace restore, preserve keep if dgn == 1 -keep swv idhh idperson dag +keep swv idhh idperson dag dagpns les_c4 rename idperson idfather rename dag dagfather +rename dagpns dagpnsfather +rename les_c4 les_c4father save "$dir_data/temp_father_dag", replace restore @@ -893,20 +961,37 @@ merge m:1 swv idhh idfather using "$dir_data/temp_father_dag" keep if _merge == 1 | _merge == 3 drop _merge -//Adult child is identified on the successful merge with mother / father in the same household and age -gen adultchildflag = (!missing(dagmother) | !missing(dagfather)) & dag >= $age_become_responsible & idpartner <= 0 -*Introduce a condition that (adult) children cannot be older than parents-15 year of age -replace adultchildflag = 0 if dag >= dagfather-15 | dag >= dagmother-15 +/*Individual is considered as adult child if +- they have at least one parent in the household (i.e. 
non-missing parental age) +- aged 18+ +- do not have a partner living in the same household +- is at least 15 years younger than either of their parents +- neither of their parents is of the state retirement age in that particular year & neither is retired +*/ +gen adultchildflag = (!missing(dagmother) | !missing(dagfather)) & dag >= $age_become_responsible & idpartner <= 0 +replace adultchildflag = 0 if dag >= dagfather-15 & dag >= dagmother-15 //was previously or ==> replaced with & +//fre adultchildflag +replace adultchildflag = 0 if (dagpnsmother==1 | les_c4mother==4) & (dagpnsfather ==1 | les_c4father==4) +tab2 adultchildflag swv , row + +/*Account for cases missing information +replace adultchildflag = -9 if idmother>0 & /// + (dagmother==. | dagmother<0 | les_c4mother==. | les_c4mother<0) & dag >= 17 +replace adultchildflag = -9 if idfather>0 & /// + (dagfather==. | dagfather<0 | les_c4father==. | les_c4father<0) & dag >= 17 +fre adultchildflag +2.7% have missing info on one of their parents, not sure if it is worth dropping them? 
+*/ /************************Household composition*********************************/ cap gen dhhtp_c4 = -9 replace dhhtp_c4 = 1 if dcpst == 1 & dnc == 0 //Couple, no children replace dhhtp_c4 = 2 if dcpst == 1 & dnc > 0 & !missing(dnc) //Couple, children -replace dhhtp_c4 = 3 if (dcpst == 2 | dcpst == 3) & (dnc == 0 | dag <= $age_become_responsible | adultchildflag== 1) +replace dhhtp_c4 = 3 if (dcpst == 2) & (dnc == 0 | dag <= $age_become_responsible | adultchildflag== 1) /*Single, no children (Note: adult children and children below age to become responsible should be assigned "no children" category, even if there are some children in the household)*/ -replace dhhtp_c4 = 4 if (dcpst == 2 | dcpst == 3) & dnc > 0 & !missing(dnc) & dhhtp_c4 != 3 //Single, children +replace dhhtp_c4 = 4 if (dcpst == 2) & dnc > 0 & !missing(dnc) & dhhtp_c4 != 3 //Single, children la def dhhtp_c4_lb 1"Couple with no children" 2"Couple with children" 3"Single with no children" 4"Single with children" la values dhhtp_c4 dhhtp_c4_lb @@ -998,75 +1083,11 @@ la var drtren "DEMOGRAPHIC: Enter retirement" //fre drtren -/****************************Pension Age***************************************/ -/*cap gen bdt = mdy(1, 15, birthy) /*month of birth is available in special license only*/ -*/ -/*State Retirement Ages for Men in the UK (2009-2023): - -2009-2010: 65 -2010-2011: 65 -2011-2012: 65 -2012-2013: 65 -2013-2014: 65 -2014-2015: 65 -2015-2016: 65 -2016-2017: 65 -2017-2018: 65 -2018-2019: 65 -2019-2020: 65 -2020-2021: 66 -2021-2022: 66 -2022-2023: 66 - -State Retirement Ages for Women in the UK (2009-2023): - -2009-2010: 60 -2010-2011: 60 -2011-2012: 60 -2012-2013: 61 -2013-2014: 61 -2014-2015: 62 -2015-2016: 62 -2016-2017: 63 -2017-2018: 63 -2018-2019: 64 -2019-2020: 65 -2020-2021: 65 -2021-2022: 66 -2022-2023: 66 -*/ -gen dagpns = 0 -//for men -replace dagpns = 1 if dgn==1 & dag>=65 & stm>=2009 & stm<2020 -replace dagpns = 1 if dgn==1 & dag>=66 & stm>=2020 -//for women -replace dagpns = 
1 if dgn==0 & dag>=60 & stm>=2009 & stm<2012 -replace dagpns = 1 if dgn==0 & dag>=61 & stm>=2012 & stm<2014 -replace dagpns = 1 if dgn==0 & dag>=62 & stm>=2014 & stm<2016 -replace dagpns = 1 if dgn==0 & dag>=63 & stm>=2016 & stm<2018 -replace dagpns = 1 if dgn==0 & dag>=64 & stm>=2018 & stm<2019 -replace dagpns = 1 if dgn==0 & dag>=65 & stm>=2019 & stm<2021 -replace dagpns = 1 if dgn==0 & dag>=66 & stm>=2021 - - -/****************************Pension age of a spouse***************************/ -preserve -keep swv idperson idhh dagpns -rename dagpns dagpns_sp -rename idperson idpartner -save "$dir_data/temp_dagpns", replace -restore -merge m:1 swv idpartner idhh using "$dir_data/temp_dagpns" -keep if _merge == 1 | _merge == 3 -la var dagpns_sp "Pension age - partner" -drop _merge -replace dagpns_sp=-9 if idpartner<0 - /************************************JBSTAT: Not Retired***********************/ -gen lesnr_c2 = . -replace lesnr_c2 = 1 if (jbstat ==1 | jbstat==2) /*employed*/ -replace lesnr_c2 = 2 if jbstat==3 | jbstat==5 | jbstat==6 | jbstat==8 | jbstat==9 | jbstat==10 | jbstat==11 | jbstat==14 | jbstat==97 +gen lesnr_c2 = -9 +replace lesnr_c2 = 1 if les_c3==1 +replace lesnr_c2 = 2 if les_c3==2 | les_c3==3 lab var lesnr_c2 "Not retired work status" lab define lesnr_c2 1 "in work" 2 "not in work" lab val lesnr_c2 lesnr_c2 @@ -1074,22 +1095,22 @@ lab val lesnr_c2 lesnr_c2 /************************Exited parental home*********************************/ /*Generated from fnspid and/or mnspid. 1 means that individual no longer lives with a parent (fnspid & mnspid is equal to missing) - when in the previous wave they lived with a parent (fnspid or mnspid not equal to missing).*/ -/* -bysort swv: fre mnspid if mnspid<=0 -bysort swv: fre fnspid if fnspid<=0 -bysort swv: fre mnspid if mnspid>=. -bysort swv: fre fnspid if fnspid>=. + when in the previous wave they lived with a parent (fnspid or mnspid not equal to missing). 
+NOTE: Leaving the parental home was synchronised with the definition of adult child; +an individual can leave the parental home unless they are a "responsible adult" (their both parents retired). */ sort idperson swv -gen dlftphm = -9 if (l.fnspid<0 & l.mnspid<0) //those who did not live with parents in the same hh -replace dlftphm=0 if (l.fnspid>0 | l.mnspid>0) //those who lived with at least one parent -replace dlftphm =1 if (fnspid<0 & mnspid<0) & (l.fnspid>0 | l.mnspid>0) //lived with at least one parent but not anymore -bys idperson: replace dlftphm =-9 if _n==1 //this condition will not be applicable for first year in the panel// -la val dlftphm dummy -la var dlftphm "DEMOGRAPHIC: Exited Parental Home" -//bys swv: fre dlftphm - +gen dlftphm = -9 +replace dlftphm = 0 if adultchildflag[_n-1] == 1 & idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +replace dlftphm = 0 if dag == 18 & adultchildflag == 1 +replace dlftphm = 1 if adultchildflag == 0 & adultchildflag[_n-1] == 1 & idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +lab var dlftphm "DEMOGRAPHIC: Exit the Parental Home" +/* +tab dlftphm swv, col +tab dlftphm stm, col +tab dlftphm dun +tab dlftphm adultchildflag +*/ /*********************************Left education*******************************/ sort idperson swv @@ -1148,6 +1169,15 @@ fre dukfr /************************Number of newborn*************************/ +/*NOTE: The approach below was not entirely correct for identifying newborns. +* It defines newborns based on child age (dag <= 1), not on actual birth events. +* As a result, it counts all children aged under one at interview, not just those +* born since the previous wave. The same baby can be counted twice across waves, +* and adopted or stepchildren under one may also be included. +* At the BHPS–UKHLS transition, this method overcounts legacy BHPS infants +* who were already born before the merge but still under one year old in wave B. 
+ +* reported since the last interview, linked to the reporting parent (usually the mother). cap gen child0 = 0 replace child0=1 if dag<=1 @@ -1173,7 +1203,137 @@ replace mother_dchpd=0 if dgn==1 drop dchpd rename mother_dchpd dchpd lab var dchpd "Women's number of newborn children" +*/ + +save "$dir_data\ukhls_pooled_all_obs_02.dta", replace +************************************************************************ +* Number of newborn from "newborn" datasets +************************************************************************ +/*DP: This code uses the UKHLS newborn module, where each row directly represents a birth event (not inferred from child age). +Each record corresponds to a child newly reported since the last interview. We exclude BHPS “legacy” infants in wave B to prevent overcounting at the merge. +- It is more conceptually exact – counts actual reported births, not inferred ones. +- No double-counting across waves – each newborn appears only once. +- Handles BHPS transition properly – avoids inflating wave B with pre-existing BHPS babies (note that in original Cara's SAS code all BHPS newborns were dropped which I think shoudn't happen, + so Cara's version was underestimating number of newborns. +*/ + +* Combine newborn files (b–n) into one long-format dataset +clear + +local firstwave : word 1 of $UKHLS_panel_waves + +* --- Load the first wave --- +use "${dir_ukhls_data}/`firstwave'_newborn.dta", clear +gen swv = "`firstwave'" + +* Remove wave prefix from variable names +local prefix = "`firstwave'_" +foreach var of varlist `firstwave'_* { + local base = subinstr("`var'", "`prefix'", "", .) + rename `var' `base' +} + +* Save as base file +save "${dir_data}/temp_uknbrn.dta", replace + +* --- Append remaining waves --- +foreach w of global UKHLS_panel_waves { + if "`w'" != "`firstwave'" { + di as text "Appending wave `w'..." 
+ use "${dir_ukhls_data}/`w'_newborn.dta", clear + gen swv = "`w'" + + * Remove wave prefix + local prefix = "`w'_" + capture unab prefixed : `w'_* + if _rc == 0 { + foreach var of local prefixed { + local base = subinstr("`var'", "`prefix'", "", .) + rename `var' `base' + } + } + + * Append to the long dataset + append using "${dir_data}/temp_uknbrn.dta" + save "${dir_data}/temp_uknbrn.dta", replace + } +} +//convert wave number to numeric +gen swv_num = . +local i = 1 +foreach w of global UKHLS_panel_waves { + local num : word `i' of $UKHLS_panel_waves_numbers + replace swv_num = `num' if swv == "`w'" + local ++i +} +drop swv +rename swv_num swv +save "${dir_data}/temp_uknbrn.dta", replace + +* Count all genuine newborns (UKHLS + BHPS), excludes BHPS legacy infants in wave B +use "${dir_data}/temp_uknbrn.dta", clear + +keep pidp swv memorig lchlv +keep if lchlv == 1 + +* Define newborn indicator +gen byte nbrn = 0 +* UKHLS-origin respondents (memorig = 1, 2, 7, 8): +* Always count their newborns. These are all part of the original or ethnic minority boost samples. +replace nbrn = 1 if inlist(memorig, 1, 2, 7, 8) +* BHPS-origin respondents (memorig = 3, 4, 5, 6): +* The BHPS sample was integrated into UKHLS starting from wave B (2010–2012). +* Infants recorded at that point include "legacy" BHPS babies already born before +* the merge — not genuine new births within the UKHLS observation window. +** To avoid overcounting these legacy infants, we exclude BHPS-origin newborns +* only in their first UKHLS wave (wave B). From wave C onward, BHPS households +* are fully integrated, so new births are genuine new births and should be counted. +replace nbrn = 1 if inlist(memorig, 3, 4, 5, 6) & swv != 2 + +* Collapse to parent-wave level ==> both parents may report the same child of they are in the same hh +bys pidp swv: egen dchpd = total(nbrn) +label var dchpd "Number of newborn children (UKHLS + BHPS, excl. 
BHPS legacy infants in wave B)" +bys pidp swv: keep if _n == 1 //(376 observations deleted) +rename pidp idperson +save "${dir_data}/temp_parent_dchpd.dta", replace + +* Merge into main person-wave dataset +use "$dir_data\ukhls_pooled_all_obs_02.dta", clear +merge 1:1 idperson swv using "${dir_data}/temp_parent_dchpd.dta" +keep if _merge ==1 | _merge==3 +drop _merge + +* After merging: fill missing with 0 +replace dchpd = 0 if missing(dchpd) +label var dchpd "Number of newborn children (UKHLS + BHPS, excl. BHPS legacy infants in wave B)" +/*check how many hh reported same newborn twice because both parents are respondents +preserve +* Keep only cases with at least one newborn +keep if dchpd > 0 +* Keep only core identifiers and gender +keep idperson idhh swv dgn dchpd + +* Count households with both male and female respondents reporting newborns +bysort idhh swv: egen hh_births = total(dchpd>0) +bysort idhh swv: egen men_births = total(dchpd>0 & dgn==1) +bysort idhh swv: egen women_births = total(dchpd>0 & dgn==0) + +* Mark households where both genders reported at least one newborn +gen both_parents = (men_births>0 & women_births>0) + +* Summarise how common these are +tab men_births +tab women_births +tab both_parents +*No such cases, new births are reported by women only +restore +*/ + +* Note that for the estimates we will only keep newborns who are reported by mothers, but here we keep all reported newborns for each respondent +tab2 swv dchpd if dgn==1, m row +tab2 swv dchpd if dgn==0 & sprfm==1, m row +tab2 swv dchpd if dgn==0 & sprfm==0, m row /*****************************In educational age range*************************/ gen sedag = 1 if dvage >= 16 & dvage <= 29 @@ -1620,8 +1780,6 @@ foreach var in idhh idperson idpartner idfather idmother dct drgn1 dwt dnc02 dnc } - - *recode missings in weights to zero. 
foreach var in dimlwt disclwt dimxwt dhhwt { qui recode `var' (.=0) (-9/-1=0) @@ -1657,6 +1815,32 @@ isid idperson idhh swv * save the whole pooled dataset that will be used for regression estimates *******************************************************************************/ save "$dir_data\ukhls_pooled_all_obs_02.dta", replace + + + +/*********************** Run employment history do-files to produce liwwh *******************************/ +* 01_Intdate.do: set up cross-wave file of interview dates +* ==> needed to link previous wave interview date to each respondent*/ +do ${dir_do_emphist}/00_Master_emphist.do + +use "$dir_data\ukhls_pooled_all_obs_02.dta", clear + +merge 1:1 idperson swv using ${dir_data_emphist}/temp_liwwh, keepusing (liwwh) +//This is done analogous to UKMOD input data +drop if _merge==2 +replace liwwh=12 if _merge==1 +replace liwwh=0 if _merge==1 & les_c3 !=1 //assume zero months if not in employment +replace liwwh=-9 if swv==1 + +replace liwwh = liwwh/12 +label var liwwh "Total years in employment since Jan 2007" + +bys swv: fre liwwh if dag<16 +bys swv: fre liwwh if dag>=16 + +drop _merge +save "$dir_data\ukhls_pooled_all_obs_02.dta", replace + cap log close @@ -1666,7 +1850,6 @@ cap log close #delimit ; local files_to_drop father_edu.dta - mother_dchpd.dta mother_edu.dta temp.dta temp_age.dta @@ -1682,7 +1865,8 @@ local files_to_drop temp_ypnb.dta tmp_partnershipDuration.dta temp_dot01.dta - + temp_uknbrn.dta + temp_parent_dchpd.dta ; #delimit cr // cr stands for carriage return diff --git a/input/InitialPopulations/compile/06_reweight_and_slice.do b/input/InitialPopulations/compile/06_reweight_and_slice.do index d94be7bd9..f3e0aa8f6 100644 --- a/input/InitialPopulations/compile/06_reweight_and_slice.do +++ b/input/InitialPopulations/compile/06_reweight_and_slice.do @@ -3,7 +3,7 @@ * WEIGHT ADJUSTMENT TO ACCOUNT FOR USING HOUSEHOLDS WITHOUT MISSING VALUES * * AUTH: Patryk Bronka, Daria Popova, Justin van de Ven -* LAST EDIT: 18 July 
2025 DP +* LAST EDIT: 21 Oct 2025 DP * *********************************************************************/ ******************************************************************************** @@ -60,7 +60,7 @@ recode hh_size (1=1) (2=2) (3=3) (4/max=4) , gen(hhsize_cat2) /*Household-level probit. Model probabiltiy of being a complete household conditional on presence of people of certain education age gender combination, marital status and region.*/ -probit complete_hh _Ideh* dcpstcat* ib8.drgn1 i.stm , vce(robust) iterate(20) //i.hhsize_cat2 DP: dropped as otherwise does not converge +probit complete_hh _Ideh* dcpstcat* ib8.drgn1 i.stm , vce(robust) iterate(20) //i.hhsize_cat2, dropped as otherwise does not converge *Predict probability of being a complete household predict pr_comphh diff --git a/input/InitialPopulations/compile/08_wealth_to_ukhls.do b/input/InitialPopulations/compile/08_wealth_to_ukhls.do index 40be09823..511ebb069 100644 --- a/input/InitialPopulations/compile/08_wealth_to_ukhls.do +++ b/input/InitialPopulations/compile/08_wealth_to_ukhls.do @@ -404,10 +404,10 @@ save "population_initial_fs_UK_$yearWealth", replace /************************************************************************************** * clean-up and exit -*************************************************************************************/ +************************************************************************************* #delimit ; local files_to_drop - ukhls_wealthtemp.dta + ukhls_wealthtemp.dta ukhls_wealthtemp1.dta ukhls_wealthtemp2.dta ukhls_wealthtemp3.dta @@ -417,8 +417,8 @@ local files_to_drop foreach file of local files_to_drop { erase "$dir_data/`file'" } - - + +*/ /************************************************************************************** * fin **************************************************************************************/ diff --git a/input/InitialPopulations/compile/09_finalise_input_data.do 
b/input/InitialPopulations/compile/09_finalise_input_data.do index 8d3d0b202..d261291ad 100644 --- a/input/InitialPopulations/compile/09_finalise_input_data.do +++ b/input/InitialPopulations/compile/09_finalise_input_data.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 18 July 2025 +* LAST UPDATE: 3 Nov 2025 * NOTE: Called from 00_master.do - see master file for further details *************************************************************************************** @@ -181,23 +181,23 @@ forvalues yy = $firstSimYear/$lastSimYear { *limit saved variables keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 /// - yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 /// + yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dchpd dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 /// stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt /// potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost carehoursprovidedweekly /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress + ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress liwwh order idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen /// - dcpyy dcpex dcpagdf 
ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag /// + dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dchpd dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag /// multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost carehoursprovidedweekly /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress + ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress liwwh recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp /// - dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 /// + dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dchpd dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw l1_lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 /// adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth tot_pen nvmhome need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs carehoursprovidedweekly /// econ_benefits econ_benefits_nonuc econ_benefits_uc /// - formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress (missing=-9) + 
formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dls dot dot01 unemp financial_distress liwwh (missing=-9) gsort idhh idbenefitunit idperson save "$dir_data/population_initial_UK_$year.dta", replace @@ -207,11 +207,16 @@ forvalues yy = $firstSimYear/$lastSimYear { } cap log close -/**************************************************************************************** + +**************************************************************************************** * finalise *************************************************************************************** #delimit ; local files_to_drop + ukhls_wealthtemp.dta + ukhls_wealthtemp1.dta + ukhls_wealthtemp2.dta + ukhls_wealthtemp3.dta was_wealthdata.dta ; #delimit cr // cr stands for carriage return @@ -219,7 +224,7 @@ local files_to_drop foreach file of local files_to_drop { erase "$dir_data/`file'" } -*/ + *************************************************************************************** * end diff --git a/input/InitialPopulations/compile/10_check_yearly_data.do b/input/InitialPopulations/compile/10_check_yearly_data.do index 9020c467c..1566efb6d 100644 --- a/input/InitialPopulations/compile/10_check_yearly_data.do +++ b/input/InitialPopulations/compile/10_check_yearly_data.do @@ -6,7 +6,7 @@ * COUNTRY: UK * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * AUTHORS: Daria Popova -* LAST UPDATE: 18 July 2025 DP +* LAST UPDATE: 3 Nov 2025 DP * NOTE: Called from 00_master.do - see master file for further details ***************************************************************************************/* set matsize 11000, permanently @@ -52,7 +52,8 @@ dcpagdf ynbcpdf_dv der sedag -sprfm +sprfm +dchpd dagsp dehsp_c3 dhesp @@ -96,6 +97,7 @@ unemp dls financial_distress carehoursprovidedweekly +liwwh ; #delimit cr // cr stands for carriage return @@ -135,8 +137,7 @@ idfather pno swv dgn -dag -dcpst +dag dnc02 dnc ded @@ -156,7 +157,8 @@ dcpagdf ynbcpdf_dv der sedag -sprfm +sprfm 
+dchpd dagsp stm dhm @@ -169,7 +171,6 @@ multiplier dwt dcpst_1 dcpst_2 -dcpst_3 deh_c3_1 deh_c3_2 deh_c3_3 @@ -257,6 +258,7 @@ unemp dls financial_distress carehoursprovidedweekly +liwwh ; #delimit cr // cr stands for carriage return @@ -302,7 +304,7 @@ qui sum `varlist2' , de outreg2 using "$dir_data/population_initial_UK_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2') } -/* + ********************************************************************** *output summary stats for new initial populations before dropping hhs* ********************************************************************** @@ -346,40 +348,10 @@ qui sum `varlist2' , de outreg2 using "$dir_data/population_initial_fs_UK_sumstats.xls" if stm==`year', sum(log) append cttop(`year') keep (`varlist2') } -*/ + cap erase "$dir_data/population_initial_UK_sumstats.txt" cap erase "$dir_data/population_initial_fs_UK_sumstats.txt" cap log close -/* -************************************************************* -*clean up new initial populations - keep only required vars * -************************************************************* -forvalues year=2010/2023 { -insheet using "$dir_data/population_initial_UK_`year'.csv", clear - - *limit saved variables - keep idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 /// - yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 /// - stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag multiplier dwt /// - potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// - ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dot dot01 unemp dhe_mcssp dhe_pcssp - - order idhh idbenefitunit 
idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp dcpen /// - dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 adultchildflag /// - multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs formal_socare_cost /// - ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp - - recode idhh idbenefitunit idperson idpartner idmother idfather pno swv dgn dag dcpst dnc02 dnc ded deh_c3 sedex jbstat les_c3 dlltsd dlltsd01 dhe ydses_c5 yplgrs_dv ypnbihs_dv yptciihs_dv dhhtp_c4 ssscp /// - dcpen dcpyy dcpex dcpagdf ynbcpdf_dv der sedag sprfm dagsp dehsp_c3 dhesp lessp_c3 dehm_c3 dehf_c3 stm lesdf_c4 ppno dhm scghq2_dv dhh_owned lhw drgn1 dct dwt_sampling les_c4 dhm_ghq lessp_c4 /// - adultchildflag multiplier dwt potential_earnings_hourly l1_potential_earnings_hourly liquid_wealth need_socare formal_socare_hrs partner_socare_hrs daughter_socare_hrs son_socare_hrs other_socare_hrs /// - formal_socare_cost ypncp ypnoab aidhrs carewho dhe_mcs dhe_pcs dhe_mcssp dhe_pcssp dot dot01 unemp (missing=-9) - - gsort idhh idbenefitunit idperson - save "$dir_data/population_initial_UK_`year'.dta", replace - export delimited using "$dir_data/population_initial_UK_`year'.csv", nolabel replace -} -*/ - diff --git a/input/InitialPopulations/compile/RegressionEstimates/master.do b/input/InitialPopulations/compile/RegressionEstimates/master.do index 373b7f3f8..4e4337a13 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/master.do +++ b/input/InitialPopulations/compile/RegressionEstimates/master.do @@ -8,7 +8,7 @@ * DATA: UKHLS EUL version - UKDA-6614-stata [to wave n] * * 
AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 1 July 2025 DP +* LAST UPDATE: 5 Nov 2025 DP *************************************************************************************** *************************************************************************************** @@ -47,7 +47,7 @@ set matsize 1000 **************************************************************************************/ * Working directory -global dir_work "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates" +global dir_work "D:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\regression_estimates" * Directory which contains do files global dir_do "${dir_work}/do" @@ -62,26 +62,26 @@ global dir_raw_results "${dir_work}/raw_results" global dir_results "${dir_work}/results" * Directory which contains pooled dataset for estimates -global dir_ukhls_data "D:\Dasha\ESSEX\ESPON 2024\UK\initial_populations\data" +global dir_ukhls_data "D:\Dasha\ESSEX\_SimPaths\_SimPaths_UK\initial_populations\data" * Directory containing external input data global dir_external_data "$dir_work/external_data" * Directory containing results of comparison of various weights -global weight_checks "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\weight_checks" +global weight_checks "${dir_work}/weight_checks" *********************Internal validation**************************************** * Directory to save data for internal validation -global dir_validation_data "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\data" +global dir_validation_data "${dir_work}/internal_validation/data" * Directory for internal validation do-files -global dir_do_validation "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\do_files" +global dir_do_validation "${dir_work}/internal_validation/do_files" * Directory for internal validation do-files -global dir_do_validation "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\do_files" +global dir_do_validation 
"${dir_work}/internal_validation/do_files" * Directory for internal validation do-files -global dir_validation_graphs "D:\Dasha\ESSEX\ESPON 2024\UK\regression_estimates\internal_validation\graphs" +global dir_validation_graphs "${dir_work}/internal_validation/graphs" global countyy "UK" @@ -108,9 +108,7 @@ do "${dir_do}/reg_wages.do" do "${dir_do}/reg_income.do" - - -/******************************************************************************* +******************************************************************************* * INTERNAL VALIDATION FILES ****************************************************************************** @@ -131,7 +129,7 @@ do "$dir_do_validation/int_val_retirement.do" do "$dir_do_validation/int_val_wages.do" do "$dir_do_validation/int_val_income.do" -*/ + /************************************************************************************** * END OF FILE **************************************************************************************/ diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do index 95ed15194..c40bd79b5 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_fertility.do @@ -1,9 +1,9 @@ -******************************************************************************** +********************************************************************************* * PROJECT: ESPON * SECTION: Fertility * OBJECT: Final Probit Models * AUTHORS: Daria Popova, Justin van de Ven -* LAST UPDATE: 26 Aug 2025 DP +* LAST UPDATE: 21 Oct 2025 DP * COUNTRY: UK * * NOTES: Simplified the fertility process for those in this initial @@ -37,7 +37,7 @@ putexcel set "$dir_results/reg_fertility", sheet("Info") replace putexcel A1 = "Description:" putexcel B1 = "Model parameters governing projection of fertility" putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, 
Daria Popova" -putexcel A3 = "Last edit: 1 July 2025 DP" +putexcel A3 = "Last edit: 3 Nov 2025 DP" putexcel A4 = "Process:", bold putexcel B4 = "Description:", bold @@ -49,7 +49,7 @@ putexcel B6 = "Probit regression estimates of probability of having a child for putexcel A10 = "Notes:", bold putexcel B10 = "All processes: replaced dhe with dhe_pcs and dhe_mcs, added ethnicity-4 cat (dot), covid dummies (y2020 y2021)" putexcel B11 = "F1a: only 24 obs having a child when in initial education spell, therefore have to take away some covariates to obtain estimate" - +putexcel B12 = "All processes: replaced dcpst with a dummy version (1=partnered 2=single)" putexcel set "$dir_results/reg_fertility", sheet("Gof") modify putexcel A1 = "Goodness of fit", bold @@ -63,9 +63,11 @@ xtset idperson swv * Process F1a: Probabiltiy of having a child * Sample: Women aged 18-44, in initial education spell education. * DV: New born child dummy (note that in the estimation sample dchpd contains the number of newborn children, which could be >1) - +tab sprfm dgn replace dchpd=1 if dchpd>1 & dchpd<. 
-// only 69 ppl meet the condition in total +replace dchpd = 0 if dchpd==-9 +tab2 swv dchpd, row + tab dchpd if (sprfm == 1 & ded == 1) /*///////////////////////////////////////////////////////////////////////////////////////////////// @@ -86,10 +88,9 @@ erase "${weight_checks}/weight_comparison_F1a.txt" //////////////////////////////////////////////////////////////////////////////////////////////////// */ -probit dchpd dag /*dhe dhe_mcs dhe_pcs*/ ib1.dcpst stm /*y2020 y2021*/ i.dot if /// +probit dchpd Dag /*dhe dhe_mcs dhe_pcs li.Dcpst_Single*/ Year_transformed /*y2020 y2021*/ Ethn_Asian Ethn_Black Ethn_Other if /// sprfm == 1 & ded == 1 [pweight=dimxwt], vce(robust) - * raw results matrix results = r(table) matrix results = results[1..6,1...]' @@ -112,95 +113,77 @@ scalar N = e(N) scalar chi2 = e(chi2) scalar ll = e(ll) +* Store results in Excel -* Results -* Note: Zeros eliminated - +* Store estimates matrix b = e(b) matrix V = e(V) +mata: + // Call matrices into mata + V = st_matrix("V") + b = st_matrix("b") -* Store variance-covariance matrix - -preserve - -putexcel set "$dir_raw_results/fertility/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) - -import excel "$dir_raw_results/fertility/var_cov", sheet("var_cov") clear - -describe -local no_vars = `r(k)' + // Find which coefficients are nonzero + keep = (b :!= 0) -forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} + // Eliminate zeros + b_trimmed = select(b, keep) + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + // Inspection + b_trimmed + V_trimmed -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_fertility", sheet("UK_F1a") modify -putexcel C2 = matrix(var) - -restore + // Return to Stata + st_matrix("b_trimmed", b_trimmed') + st_matrix("V_trimmed", V_trimmed) + st_matrix("nonzero_b_flag", keep) +end +* Export into Excel +putexcel set "$dir_results/reg_fertility", sheet("F1a") modify +putexcel B2 = 
matrix(b_trimmed) +putexcel C2 = matrix(V_trimmed) -* Store estimated coefficients -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) - -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} - -putexcel set "$dir_results/reg_fertility", sheet("UK_F1a") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) +* Labelling +// Need to variable label when add new variable to model. Order matters. +local var_list Dag Year_transformed Ethn_Asian Ethn_Black Ethn_Other Constant + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" +local i = 1 +foreach var in `var_list' { + local ++i + putexcel A`i' = "`var'" -* Labelling +} -putexcel A1 = "REGRESSOR" -putexcel A2 = "Dag" -putexcel A3 = "Dcpst_Single" -putexcel A4 = "Year_transformed" -putexcel A5 = "Ethn_Black" -putexcel A6 = "Ethn_Other" -putexcel A7 = "Constant" +local i = 2 +foreach var in `var_list' { + local ++i -putexcel B1 = "COEFFICIENT" -putexcel C1 = "Dag" -putexcel D1 = "Dcpst_Single" -putexcel E1 = "Year_transformed" -putexcel F1 = "Ethn_Black" -putexcel G1 = "Ethn_Other" -putexcel H1 = "Constant" + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} - -* Goodness of fit +* Export model fit statistics putexcel set 
"$dir_results/reg_fertility", sheet("Gof") modify -putexcel A3 = "F1a - Fertility in initial education spell", bold +putexcel A9 = "F1a - Fertility, in initial education spell", bold putexcel A5 = "Pseudo R-squared" putexcel B5 = r2_p @@ -212,7 +195,10 @@ putexcel E6 = "Log likelihood" putexcel F6 = ll drop in_sample p -scalar drop r2_p N chi2 ll +scalar drop r2_p N chi2 ll + + + ************************************************ * F1b - Having a child, left initial edu spell * @@ -245,9 +231,17 @@ erase "${weight_checks}/weight_comparison_F1b.txt" //////////////////////////////////////////////////////////////////////////////////////////////////// */ -probit dchpd dag dagsq li.ydses_c5 l.dnc l.dnc02 /*ib1.dhe*/ dhe_pcs dhe_mcs /*ib1.dcpst*/ /// - lib1.dcpst ib1.deh_c3 dukfr li.les_c3 ib8.drgn1 stm y2020 y2021 i.dot if /// - (sprfm == 1 & ded == 0) [pweight=dimxwt], vce(robust) +probit dchpd Dag Dag_sq Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 /// + Dnc_L1 Dnc02_L1 /// + Dhe_pcs Dhe_mcs /// + Dcpst_Single_L1 /// + Deh_c3_Medium Deh_c3_Low /// + FertilityRate /// + Les_c3_Student_L1 Les_c3_NotEmployed_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other /// +if (sprfm == 1 & ded == 0) [pweight = dimxwt], vce(robust) + * raw results matrix results = r(table) @@ -271,151 +265,84 @@ scalar N = e(N) scalar chi2 = e(chi2) scalar ll = e(ll) - -* Results -* Note: Zeros eliminated - -matrix b = e(b) -matrix V = e(V) - -* Store variance-covariance matrix +* Store results in Excel -preserve - -putexcel set "$dir_raw_results/fertility/var_cov", sheet("var_cov") replace -putexcel A1 = matrix(V) +* Store estimates +matrix b = e(b) +matrix V = e(V) -import excel "$dir_raw_results/fertility/var_cov", sheet("var_cov") clear +mata: + // Call matrices into mata + V = st_matrix("V") + b = st_matrix("b") -describe -local no_vars = `r(k)' + // Find which coefficients are nonzero + keep = (b :!= 0) 
-forvalues i = 1/2 { - egen row_sum = rowtotal(*) - drop if row_sum == 0 - drop row_sum - xpose, clear -} + // Eliminate zeros + b_trimmed = select(b, keep) + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + // Inspection + b_trimmed + V_trimmed -mkmat v*, matrix(var) -putexcel set "$dir_results/reg_fertility", sheet("UK_F1b") modify -putexcel C2 = matrix(var) - -restore + // Return to Stata + st_matrix("b_trimmed", b_trimmed') + st_matrix("V_trimmed", V_trimmed) + st_matrix("nonzero_b_flag", keep) +end +* Export into Excel +putexcel set "$dir_results/reg_fertility", sheet("F1b") modify +putexcel B2 = matrix(b_trimmed) +putexcel C2 = matrix(V_trimmed) -* Store estimated coefficients +* Labelling +// Need to variable label when add new variable to model. Order matters. +local var_list Dag Dag_sq Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 /// + Dnc_L1 Dnc02_L1 /// + Dhe_pcs Dhe_mcs /// + Dcpst_Single_L1 /// + Deh_c3_Medium Deh_c3_Low /// + FertilityRate /// + Les_c3_Student_L1 Les_c3_NotEmployed_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other Constant + + +putexcel A1 = "REGRESSOR" +putexcel B1 = "COEFFICIENT" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b +local i = 2 +foreach var in `var_list' { + local ++i -// Loop through each element in `b` to count non-zero coefficients -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" } -} - -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) 
- -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ } } -putexcel set "$dir_results/reg_fertility", sheet("UK_F1b") modify -putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) - - -* Labelling - -putexcel A1 = "REGRESSOR" -putexcel A2 = "Dag" -putexcel A3 = "Dag_sq" -putexcel A4 = "Ydses_c5_Q2_L1" -putexcel A5 = "Ydses_c5_Q3_L1" -putexcel A6 = "Ydses_c5_Q4_L1" -putexcel A7 = "Ydses_c5_Q5_L1" -putexcel A8 = "Dnc_L1" -putexcel A9 = "Dnc02_L1" -putexcel A10 = "Dhe_pcs" -putexcel A11 = "Dhe_mcs" -putexcel A12 = "Dcpst_Single_L1" -putexcel A13 = "Dcpst_PreviouslyPartnered_L1" -putexcel A14 = "Deh_c3_Medium" -putexcel A15 = "Deh_c3_Low" -putexcel A16 = "FertilityRate" -putexcel A17 = "Les_c3_Student_L1" -putexcel A18 = "Les_c3_NotEmployed_L1" -putexcel A19 = "UKC" -putexcel A20 = "UKD" -putexcel A21 = "UKE" -putexcel A22 = "UKF" -putexcel A23 = "UKG" -putexcel A24 = "UKH" -putexcel A25 = "UKJ" -putexcel A26 = "UKK" -putexcel A27 = "UKL" -putexcel A28 = "UKM" -putexcel A29 = "UKN" -putexcel A30 = "Year_transformed" -putexcel A31 = "Y2020" -putexcel A32 = "Y2021" -putexcel A33 = "Ethn_Asian" -putexcel A34 = "Ethn_Black" -putexcel A35 = "Ethn_Other" -putexcel A36 = "Constant" - -putexcel B1 = "COEFFICIENT" -putexcel C1 = "Dag" -putexcel D1 = "Dag_sq" -putexcel E1 = "Ydses_c5_Q2_L1" -putexcel F1 = "Ydses_c5_Q3_L1" -putexcel G1 = "Ydses_c5_Q4_L1" -putexcel H1 = "Ydses_c5_Q5_L1" -putexcel I1 = "Dnc_L1" -putexcel J1 = "Dnc02_L1" -putexcel K1 = "Dhe_pcs" -putexcel L1 = "Dhe_mcs" -putexcel M1 = "Dcpst_Single_L1" -putexcel N1 = "Dcpst_PreviouslyPartnered_L1" -putexcel O1 = "Deh_c3_Medium" -putexcel P1 = "Deh_c3_Low" 
-putexcel Q1 = "FertilityRate" -putexcel R1 = "Les_c3_Student_L1" -putexcel S1 = "Les_c3_NotEmployed_L1" -putexcel T1 = "UKC" -putexcel U1 = "UKD" -putexcel V1 = "UKE" -putexcel W1 = "UKF" -putexcel X1 = "UKG" -putexcel Y1 = "UKH" -putexcel Z1 = "UKJ" -putexcel AA1 = "UKK" -putexcel AB1 = "UKL" -putexcel AC1 = "UKM" -putexcel AD1 = "UKN" -putexcel AE1 = "Year_transformed" -putexcel AF1 = "Y2020" -putexcel AG1 = "Y2021" -putexcel AH1 = "Ethn_Asian" -putexcel AI1 = "Ethn_Black" -putexcel AJ1 = "Ethn_Other" -putexcel AK1 = "Constant" - - -* Goodness of fit - +* Export model fit statistics putexcel set "$dir_results/reg_fertility", sheet("Gof") modify -putexcel A9 = "F1b - Fertility left initial education spell", bold +putexcel A9 = "F1b - Fertility, left initial education spell", bold putexcel A11 = "Pseudo R-squared" putexcel B11 = r2_p @@ -428,7 +355,7 @@ putexcel F12 = ll drop in_sample p scalar drop r2_p N chi2 ll - - + + capture log close diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do index 427c70485..d75449cd7 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_home_ownership.do @@ -26,12 +26,11 @@ use "$dir_ukhls_data/ukhls_pooled_all_obs_09.dta", clear do "$dir_do/variable_update" -*sample selection +/*sample selection drop if dag < 16 - xtset idperson swv - +*/ * Set Excel file @@ -41,7 +40,7 @@ putexcel set "$dir_results/reg_home_ownership", sheet("Info") replace putexcel A1 = "Description:" putexcel B1 = "Model parameters governing projection of home ownership" putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" -putexcel A3 = "Last edit: 1 July 2025 DP" +putexcel A3 = "Last edit: 4 Nov 2025 DP" putexcel A4 = "Process:", bold putexcel B4 = "Description:", bold @@ -51,6 +50,7 @@ putexcel B5 = "Probit regression estimates of 
the probability of being a home ow putexcel A10 = "Notes:", bold putexcel B10 = "Have combined dhhtp_c4 and lessp_c3 into a single variable with 8 categories, dhhtp_c8" putexcel B11 = "Added lagged home ownership, replaced dhe with dhe_pcs and dhe_mcs, added ethnicity (dot) and covid dummies (y2020 2021)" +putexcel B12 = "Re-estimated process at benefit unit level to be consistent with SimPaths" putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify putexcel A1 = "Goodness of fit", bold @@ -61,12 +61,13 @@ putexcel A1 = "Goodness of fit", bold ************************ * Process HO1a: Probability of being a home owner -* Sample: Individuals aged 18+ +* Sample: Individuals aged 18+ who are benefit unit heads * DV: Home ownerhip dummy +/* fre dhh_owned if dag >= 18 -/*///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// //check weights ////////////////////////////////////////////////////////////////////////////////// probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 /// i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if /// @@ -85,12 +86,97 @@ outreg2 using "${weight_checks}/weight_comparison_HO1a.xls", alpha(0.001, 0.01, erase "${weight_checks}/weight_comparison_HO1a.txt" //////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -*/ + probit dhh_owned dgn dag dagsq il.dhhtp_c8 il.les_c3 /// i.deh_c3 /*il.dhe*/ l.dhe_mcs l.dhe_pcs il.ydses_c5 l.yptciihs_dv l.dhh_owned ib8.drgn1 stm y2020 y2021 i.dot if /// dag>=18 [pweight=dimxwt], vce(cluster idperson) +*/ + +* DEFINE BENEFIT UNIT HEAD (AGED 18+) + +* Keep adults (18+) +keep if dag >= 18 + + +* Count unique benefit-unit–wave combinations BEFORE head 
selection +egen tag_bu_wave = tag(idbenefitunit swv) +count if tag_bu_wave +local n_bu_before = r(N) +display "Number of benefit unit–wave combinations BEFORE selecting head: `n_bu_before'" + + +* Sort benefit unit members within each wave: +* 1. Highest non-benefit income (ypnbihs_dv) +* 2. Highest age (dag) +* 3. Lowest idperson (idperson) +gsort idbenefitunit swv -ypnbihs_dv -dag idperson + +* Tag the first person (the "head") per benefit unit and wave +bysort idbenefitunit swv: gen benunit_head = (_n == 1) + +* Keep only benefit unit heads +keep if benunit_head == 1 + +* Count unique benefit-unit–wave combinations AFTER head selection +drop tag_bu_wave +egen tag_bu_wave = tag(idbenefitunit swv) +count if tag_bu_wave +local n_bu_after = r(N) +display "Number of benefit unit–wave combinations AFTER selecting head: `n_bu_after'" + +* Ensure benefit unit–wave counts match before and after head selection +assert `n_bu_before' == `n_bu_after' + +* Verify only one head per benefit unit per wave +by idbenefitunit swv, sort: gen n=_N +assert n==1 + +* Declare panel +xtset idperson swv + + +******************************************************************************** +* SET EXCEL OUTPUT FILES +******************************************************************************** + +* Info sheet +putexcel set "$dir_results/reg_home_ownership", sheet("Info") replace +putexcel A1 = "Description:" +putexcel B1 = "Model parameters governing projection of home ownership" +putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" +putexcel A3 = "Last edit: 4 Nov 2025 DP" +putexcel A4 = "Process:", bold +putexcel B4 = "Description:", bold +putexcel A5 = "HO1a" +putexcel B5 = "Probit regression estimates of the probability of being a home owner, benefit unit heads aged 18+" + +putexcel A10 = "Notes:", bold +putexcel B10 = "Have combined dhhtp_c4 and lessp_c3 into a single variable with 8 categories, dhhtp_c8" +putexcel B11 = "Added lagged home ownership, replaced dhe 
with dhe_pcs and dhe_mcs, added ethnicity (dot) and covid dummies (y2020, y2021)" +putexcel B12 = "Re-estimated process at benefit unit level using heads defined by highest personal non-benefit income, or age, or lowest idperson" + +putexcel set "$dir_results/reg_home_ownership", sheet("Gof") modify +putexcel A1 = "Goodness of fit", bold + + +******************************************************************************** +* HO1a: Home ownership +******************************************************************************** + +probit dhh_owned Dgn Dag Dag_sq /// + Dhhtp_c8_2_L1 Dhhtp_c8_3_L1 Dhhtp_c8_4_L1 Dhhtp_c8_5_L1 Dhhtp_c8_6_L1 Dhhtp_c8_7_L1 Dhhtp_c8_8_L1 /// + Les_c3_Student_L1 Les_c3_NotEmployed_L1 /// + Deh_c3_Medium Deh_c3_Low /// + Dhe_mcs_L1 Dhe_pcs_L1 /// + Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 /// + Yptciihs_dv_L1 /// + Dhh_owned_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other /// + [pweight = dimxwt], vce(cluster idperson) + * raw results matrix results = r(table) @@ -177,90 +263,45 @@ putexcel A1 = matrix(nonzero_b'), names nformat(number_d2) * Labelling - +// Need to variable label when add new variable to model. Order matters. 
+local var_list Dgn Dag Dag_sq /// + Dhhtp_c8_2_L1 Dhhtp_c8_3_L1 Dhhtp_c8_4_L1 Dhhtp_c8_5_L1 Dhhtp_c8_6_L1 Dhhtp_c8_7_L1 Dhhtp_c8_8_L1 /// + Les_c3_Student_L1 Les_c3_NotEmployed_L1 /// + Deh_c3_Medium Deh_c3_Low /// + Dhe_mcs_L1 Dhe_pcs_L1 /// + Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 /// + Yptciihs_dv_L1 /// + Dhh_owned_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other /// + Constant + + putexcel A1 = "REGRESSOR" -putexcel A2 = "Dgn" -putexcel A3 = "Dag" -putexcel A4 = "Dag_sq" -putexcel A5 = "Dhhtp_c8_2_L1" -putexcel A6 = "Dhhtp_c8_3_L1" -putexcel A7 = "Dhhtp_c8_4_L1" -putexcel A8 = "Dhhtp_c8_5_L1" -putexcel A9 = "Dhhtp_c8_6_L1" -putexcel A10 = "Dhhtp_c8_7_L1" -putexcel A11 = "Dhhtp_c8_8_L1" -putexcel A12 = "Les_c3_Student_L1" -putexcel A13 = "Les_c3_NotEmployed_L1" -putexcel A14 = "Deh_c3_Medium" -putexcel A15 = "Deh_c3_Low" -putexcel A16 = "Dhe_mcs" -putexcel A17 = "Dhe_pcs" -putexcel A18 = "Ydses_c5_Q2_L1" -putexcel A19 = "Ydses_c5_Q3_L1" -putexcel A20 = "Ydses_c5_Q4_L1" -putexcel A21 = "Ydses_c5_Q5_L1" -putexcel A22 = "Yptciihs_dv_L1" -putexcel A23 = "Dhh_owned_L1" -putexcel A24 = "UKC" -putexcel A25 = "UKD" -putexcel A26 = "UKE" -putexcel A27 = "UKF" -putexcel A28 = "UKG" -putexcel A29 = "UKH" -putexcel A30 = "UKJ" -putexcel A31 = "UKK" -putexcel A32 = "UKL" -putexcel A33 = "UKM" -putexcel A34 = "UKN" -putexcel A35 = "Year_transformed" -putexcel A36 = "Y2020" -putexcel A37 = "Y2021" -putexcel A38 = "Ethn_Asian" -putexcel A39 = "Ethn_Black" -putexcel A40 = "Ethn_Other" -putexcel A41 = "Constant" - putexcel B1 = "COEFFICIENT" -putexcel C1 = "Dgn" -putexcel D1 = "Dag" -putexcel E1 = "Dag_sq" -putexcel F1 = "Dhhtp_c8_2_L1" -putexcel G1 = "Dhhtp_c8_3_L1" -putexcel H1 = "Dhhtp_c8_4_L1" -putexcel I1 = "Dhhtp_c8_5_L1" -putexcel J1 = "Dhhtp_c8_6_L1" -putexcel K1 = "Dhhtp_c8_7_L1" -putexcel L1 = "Dhhtp_c8_8_L1" -putexcel M1 = "Les_c3_Student_L1" -putexcel N1 = 
"Les_c3_NotEmployed_L1" -putexcel O1 = "Deh_c3_Medium" -putexcel P1 = "Deh_c3_Low" -putexcel Q1 = "Dhe_mcs" -putexcel R1 = "Dhe_pcs" -putexcel S1 = "Ydses_c5_Q2_L1" -putexcel T1 = "Ydses_c5_Q3_L1" -putexcel U1 = "Ydses_c5_Q4_L1" -putexcel V1 = "Ydses_c5_Q5_L1" -putexcel W1 = "Yptciihs_dv_L1" -putexcel X1 = "Dhh_owned_L1" -putexcel Y1 = "UKC" -putexcel Z1 = "UKD" -putexcel AA1 = "UKE" -putexcel AB1 = "UKF" -putexcel AC1 = "UKG" -putexcel AD1 = "UKH" -putexcel AE1 = "UKJ" -putexcel AF1 = "UKK" -putexcel AG1 = "UKL" -putexcel AH1 = "UKM" -putexcel AI1 = "UKN" -putexcel AJ1 = "Year_transformed" -putexcel AK1 = "Y2020" -putexcel AL1 = "Y2021" -putexcel AM1 = "Ethn_Asian" -putexcel AN1 = "Ethn_Black" -putexcel AO1 = "Ethn_Other" -putexcel AP1 = "Constant" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i = 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} * Goodness of fit diff --git a/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do b/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do index 9a852ab65..9c37f0f22 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do +++ b/input/InitialPopulations/compile/RegressionEstimates/reg_leaveParentalHome.do @@ -41,7 +41,7 @@ putexcel set "$dir_work/reg_leaveParentalHome", sheet("Info") replace putexcel A1 = "Description:" putexcel B1 = "Model parameters governing leaving parental home" putexcel A2 = "Authors: Patryk Bronka, Justin van de Ven, Daria Popova" -putexcel A3 = "Last edit: 1 July 2025 DP" +putexcel A3 = "Last edit: 4 Nov 2025 DP" putexcel A4 = "Process:", bold 
putexcel B4 = "Description:", bold @@ -49,23 +49,26 @@ putexcel A5 = "P1a" putexcel B5 = "Probit regression estimates for leaving the parental home - 18+, not in intitial education spell, living with parents in t-1" putexcel A10 = "Notes:", bold -putexcel B10 = "Added: ethnicity-4 cat (dot); covid dummies (y2020 y2021); not partnered condition (dcpst != 1) to be consistent with the simulation" +putexcel B10 = "Added: ethnicity-4 cat (dot); covid dummies (y2020 y2021)" +putexcel B11 = "DV is synchronised with the adult child definition" putexcel set "$dir_work/reg_leaveParentalHome", sheet("Gof") modify putexcel A1 = "Goodness of fit", bold -************************************ -* Process P1a: Leave Parental Home * -************************************ - +******************************************************************************** +* Process P1a: Leave Parental Home +******************************************************************************** * Process P1a: Probability of leaving the parental home. 
-* Sample: All respondents living with a parent in t-1, aged 18+, not in initial +* Sample: All respondents adult child in t-1 and not currently in initial * education spell -* DV: Left parental home dummy of those who lived with parents in t-1 -* Note: Added not partnered condition as well to be consistent with the simulation -fre dlftphm if (ded == 0 & dag >= 18 & dcpst != 1) //3.65% - +* DV: Observed transitioning from adult child to non-adult child + +xtset idperson swv +//fre dlftphm if (ded == 0 & dag >= 18 & dcpst != 1) //3.65% +fre dlftphm if (ded == 0 & dag >= 18 ) +tab2 stm dlftphm if (ded == 0 & dag >= 18), r + /*///////////////////////////////////////////////////////////////////////////////////////////////// //check weights ////////////////////////////////////////////////////////////////////////////////// probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot /// @@ -84,9 +87,13 @@ erase "${weight_checks}/weight_comparison_P1a.txt" //////////////////////////////////////////////////////////////////////////////////////////////////// */ -probit dlftphm i.dgn dag dagsq ib1.deh_c3 li.les_c3 li.ydses_c5 ib8.drgn1 stm y2020 y2021 i.dot /// - if (ded==0 & dag>=18 & l.dlftphm==0 & dcpst != 1) [pweight=dimxwt], vce(robust) - +probit dlftphm Dgn Dag Dag_sq Deh_c3_Medium Deh_c3_Low /// + Les_c3_Student_L1 Les_c3_NotEmployed_L1 /// + Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other /// + if (ded == 0 & dag >= 18 /*& dagpns!=1 & les_c4!=4*/ ) [pw = dimxwt], vce(robust) + * save raw results matrix results = r(table) @@ -145,98 +152,76 @@ putexcel C2 = matrix(var) restore -* Store estimated coefficients +* Store results in Excel -// Initialize a counter for non-zero coefficients -local non_zero_count = 0 -//local names : colnames b - -// Loop through each element in `b` to count non-zero coefficients 
-forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - local non_zero_count = `non_zero_count' + 1 - } -} +* Store estimates +matrix b = e(b) +matrix V = e(V) -// Create a new row vector to hold only non-zero coefficients -matrix nonzero_b = J(1, `non_zero_count', .) +mata: + // Call matrices into mata + V = st_matrix("V") + b = st_matrix("b") -// Populate nonzero_b with non-zero coefficients from b -local index = 1 -forvalues i = 1/`no_vars' { - if (b[1, `i'] != 0) { - matrix nonzero_b[1, `index'] = b[1, `i'] - local index = `index' + 1 - } -} + // Find which coefficients are nonzero + keep = (b :!= 0) + + // Eliminate zeros + b_trimmed = select(b, keep) + V_trimmed = select(V, keep) + V_trimmed = select(V_trimmed', keep)' + + // Inspection + b_trimmed + V_trimmed + + // Return to Stata + st_matrix("b_trimmed", b_trimmed') + st_matrix("V_trimmed", V_trimmed) + st_matrix("nonzero_b_flag", keep) +end +* Export into Excel putexcel set "$dir_results/reg_leaveParentalHome", sheet("UK_P1a") modify -putexcel A1 = matrix(nonzero_b'), names //nformat(number_d2) +putexcel B2 = matrix(b_trimmed) +putexcel C2 = matrix(V_trimmed) + +* Labelling +// Need to variable label when add new variable to model. Order matters. 
+local var_list Dgn Dag Dag_sq /// + Deh_c3_Medium Deh_c3_Low /// + Les_c3_Student_L1 Les_c3_NotEmployed_L1 /// + Ydses_c5_Q2_L1 Ydses_c5_Q3_L1 Ydses_c5_Q4_L1 Ydses_c5_Q5_L1 /// + UKC UKD UKE UKF UKG UKH UKJ UKK UKL UKM UKN /// + Year_transformed Y2020 Y2021 Ethn_Asian Ethn_Black Ethn_Other /// + Constant -* Labeling - putexcel A1 = "REGRESSOR" -putexcel A2 = "Dgn" -putexcel A3 = "Dag" -putexcel A4 = "Dag_sq" -putexcel A5 = "Deh_c3_Medium" -putexcel A6 = "Deh_c3_Low" -putexcel A7 = "Les_c3_Student_L1" -putexcel A8 = "Les_c3_NotEmployed_L1" -putexcel A9 = "Ydses_c5_Q2_L1" -putexcel A10 = "Ydses_c5_Q3_L1" -putexcel A11 = "Ydses_c5_Q4_L1" -putexcel A12 = "Ydses_c5_Q5_L1" -putexcel A13 = "UKC" -putexcel A14 = "UKD" -putexcel A15 = "UKE" -putexcel A16 = "UKF" -putexcel A17 = "UKG" -putexcel A18 = "UKH" -putexcel A19 = "UKJ" -putexcel A20 = "UKK" -putexcel A21 = "UKL" -putexcel A22 = "UKM" -putexcel A23 = "UKN" -putexcel A24 = "Year_transformed" -putexcel A25 = "Y2020" -putexcel A26 = "Y2021" -putexcel A27 = "Ethn_Asian" -putexcel A28 = "Ethn_Black" -putexcel A29 = "Ethn_Other" -putexcel A30 = "Constant" - putexcel B1 = "COEFFICIENT" -putexcel C1 = "Dgn" -putexcel D1 = "Dag" -putexcel E1 = "Dag_sq" -putexcel F1 = "Deh_c3_Medium" -putexcel G1 = "Deh_c3_Low" -putexcel H1 = "Les_c3_Student_L1" -putexcel I1 = "Les_c3_NotEmployed_L1" -putexcel J1 = "Ydses_c5_Q2_L1" -putexcel K1 = "Ydses_c5_Q3_L1" -putexcel L1 = "Ydses_c5_Q4_L1" -putexcel M1 = "Ydses_c5_Q5_L1" -putexcel N1 = "UKC" -putexcel O1 = "UKD" -putexcel P1 = "UKE" -putexcel Q1 = "UKF" -putexcel R1 = "UKG" -putexcel S1 = "UKH" -putexcel T1 = "UKJ" -putexcel U1 = "UKK" -putexcel V1 = "UKL" -putexcel W1 = "UKM" -putexcel X1 = "UKN" -putexcel Y1 = "Year_transformed" -putexcel Z1 = "Y2020" -putexcel AA1 = "Y2021" -putexcel AB1 = "Ethn_Asian" -putexcel AC1 = "Ethn_Black" -putexcel AD1 = "Ethn_Other" -putexcel AE1 = "Constant" + +local i = 1 +foreach var in `var_list' { + local ++i + + putexcel A`i' = "`var'" + +} + +local i 
= 2 +foreach var in `var_list' { + local ++i + + if `i' <= 26 { + local letter = char(64 + `i') // Convert 1=A, 2=B, ..., 26=Z + putexcel `letter'1 = "`var'" + } + else { + local first = char(64 + int((`i' - 1) / 26)) // First letter: A-Z + local second = char(65 + mod((`i' - 1), 26)) // Second letter: A-Z + putexcel `first'`second'1 = "`var'" // Correctly places AA-ZZ + } +} * Goodness of fit diff --git a/input/InitialPopulations/compile/RegressionEstimates/variable_update.do b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do index 84ceb66f4..e10372b10 100644 --- a/input/InitialPopulations/compile/RegressionEstimates/variable_update.do +++ b/input/InitialPopulations/compile/RegressionEstimates/variable_update.do @@ -49,9 +49,18 @@ replace dhhtp_c8 = 5 if dhhtp_c4 == 2 & lessp_c3 == 2 replace dhhtp_c8 = 6 if dhhtp_c4 == 2 & lessp_c3 == 3 replace dhhtp_c8 = 7 if dhhtp_c4 == 3 replace dhhtp_c8 = 8 if dhhtp_c4 == 4 -cap label define dhhtp_c8 1 "Couple with no children, spouse employed" 2 "Couple with no children, spouse student" 3 "Couple with no children, spouse not employed" 4 "Couple with children, spouse employed" 5 "Couple with children, spouse student" 6 "Couple with children, spouse not employed" 7 "Single with no children" 8 "Single with children" +cap label define dhhtp_c8 1 "Couple with no children, spouse employed" /// +2 "Couple with no children, spouse student" /// +3 "Couple with no children, spouse not employed" /// +4 "Couple with children, spouse employed" /// +5 "Couple with children, spouse student" /// +6 "Couple with children, spouse not employed" /// +7 "Single with no children" /// +8 "Single with children" label values dhhtp_c8 dhhtp_c8 +tab dhhtp_c8, gen(Dhhtp_c8_) + // Squared income variable cap cap gen ypnbihs_dv_sq = ypnbihs_dv^2 label variable ypnbihs_dv_sq "Personal Non-benefit Gross Income Squared" @@ -89,6 +98,13 @@ cap gen l_dhe_pcs = dhe_pcs[_n-1] if idperson == idperson[_n-1] & swv == swv[_n- cap gen l_dhe_mcs 
= dhe_mcs[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 cap gen l_dlltsd = dlltsd[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 cap gen l_dlltsd01 = dlltsd01[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +cap gen l_dnc = dnc[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +cap gen l_dnc02 = dnc02[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +cap gen l_dcpst = dcpst[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +cap gen l_dhhtp_c8 = dhhtp_c8[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +cap gen l_dhh_owned = dhh_owned[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 +cap gen l_yptciihs_dv = yptciihs_dv[_n-1] if idperson == idperson[_n-1] & swv == swv[_n-1] + 1 + // Fill in missing lags using current values at age 16 gsort +idperson -stm @@ -291,12 +307,25 @@ rename L_Dhhtp_c4_2 Dhhtp_c4_CoupleChildren_L1 rename L_Dhhtp_c4_3 Dhhtp_c4_SingleNoChildren_L1 rename L_Dhhtp_c4_4 Dhhtp_c4_SingleChildren_L1 +tab l_dhhtp_c8, gen(L_Dhhtp_c8_) +forvalues i=1/8 { +rename L_Dhhtp_c8_`i' Dhhtp_c8_`i'_L1 +} + tab dot, gen(dot_) rename dot_1 Ethn_White rename dot_2 Ethn_Asian rename dot_3 Ethn_Black rename dot_4 Ethn_Other +tab dcpst, gen(Dcpst_) +rename Dcpst_1 Dcpst_Partnered +rename Dcpst_2 Dcpst_Single + +tab l_dcpst, gen(L_Dcpst_) +rename L_Dcpst_1 Dcpst_Partnered_L1 +rename L_Dcpst_2 Dcpst_Single_L1 + cap gen Year_transformed = stm @@ -321,5 +350,20 @@ cap gen Dlltsd01 = dlltsd01 cap gen Dlltsd_L1 = l_dlltsd cap gen Dlltsd01_L1 = l_dlltsd01 +cap gen FertilityRate = dukfr + +cap gen Dnc = dnc + +cap gen Dnc02 = dnc02 + +rename l_dnc Dnc_L1 + +rename l_dnc02 Dnc02_L1 + +gen Ypnbihs_dv = ypnbihs_dv +gen Yptciihs_dv = yptciihs_dv +gen Yptciihs_dv_L1 = l_yptciihs_dv +gen Dhh_owned = dhh_owned +gen Dhh_owned_L1 = l_dhh_owned diff --git a/input/InitialPopulations/compile/do_emphist/00_Master_emphist.do b/input/InitialPopulations/compile/do_emphist/00_Master_emphist.do new file mode 
100644 index 000000000..1492a7954 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/00_Master_emphist.do @@ -0,0 +1,70 @@ +**************************************************************************************************** +* PROJECT: UKMOD update: construct a UKMOD-UKHLS database from UKHLS dataset +* DO-FILE NAME: 00_Master.do +* DESCRIPTION: Main do-file governing the creation of employment history data +* which is required for the generation of some UKMOD variables +* +* PURPOSE: The code reconstructs each respondent’s employment history month by month +* by combining information from the UKHLS and the older BHPS surveys. +* The scripts rebuild employment history using respondents’ reported current activity and interview dates across waves. +* The process links together: +* - the timing of interviews, +* - reported employment and non-employment spells, and +* - transitions between BHPS and UKHLS for legacy sample members. +* The result is a dataset showing, for every person, whether they were employed in each month +* since Jan 2007. +* +* +* The final output liwwh — the total number of months a person has been employed since January 2007. +* This provides a consistent measure of accumulated work experience over the observation window, +* suitable for use in UKMOD and labour-supply model. +* +* +* NOTES: Potentially the timeline could be extended backwards using data from +* the UKHLS Lifetime Employment Status History modules in Waves 1 and 5 +* which collected retrospective work histories from subsets of respondents. 
+* A sample scripts by Liam Wright are available but outdated: +* https://www.understandingsociety.ac.uk/documentation/mainstage/syntax/user-deposited-syntax/working-life-histories/ +*********************************************************************************************************** +* UKHLS VERSION: UKDA-6931 Special License version 2009-2023 +* AUTHORS: Nick Buck, Ricky Kanabar, Patryk Bronka, Daria Popova +* LAST REVISION: 30 Oct 2025 DP +*********************************************************************************************************** + +************************************************************************ +* Run sub-scripts +************************************************************************ +cd ${dir_data_emphist} +/* */ +* 01_Intdate.do: set up cross-wave file of interview dates +* ==> needed to link previous wave interview date to each respondent*/ +do ${dir_do_emphist}/01_Intdate.do + + +* 02_Lwintdat.do: create files of previous wave interview dates for waves c-n +* ==> helps align spells across waves for UKHLS respondents +do ${dir_do_emphist}/02_Lwintdat.do + +* 03_Bhps_lintdate.do: get last interview date under BHPS +* ==> also creates previous wave interview dates for wave b +do ${dir_do_emphist}/03_Bhps_lintdate.do + +* 04_Sp0_1_2a.do: create wave-specific spell files for everyone +* ==> each spell = period of employment/non-employment, continuous across months +* ==> Note: This does not pick up all possible variables from employment history, could be modified to pick up additional ones +do ${dir_do_emphist}/04_Sp0_1_2a.do + +* 05_Newentrant1.do: create spell file based on wave of entry (start of first job) +* ==> captures employment history for new entrants; fills gaps where possible +do ${dir_do_emphist}/05_Newentrant1.do + +* 06_Aspells1.do: create file containing all spells across waves +* ==> obtains spell start date from previous spell end date +* ==> fills some missing dates; drops cases with insufficient data +do 
${dir_do_emphist}/06_Aspells1.do + + +* 07_Empcal1a.do: create monthly employment calendar ==> used to calculate total months in employment per individual +do ${dir_do_emphist}/07_Empcal1a.do + + diff --git a/input/InitialPopulations/compile/do_emphist/01_Intdate.do b/input/InitialPopulations/compile/do_emphist/01_Intdate.do new file mode 100644 index 000000000..afae4c163 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/01_Intdate.do @@ -0,0 +1,60 @@ +************************************************************************************************* +* PROJECT: UKMOD update – create employment history data from UKHLS +* FILE: 01_Intdate.do +* +* PURPOSE: +* Creates a *cross-wave file of interview dates* for all waves (a–n) of UKHLS. +* This file is later used in 02_Lwintdate.do to determine the *previous* +* interview date for each respondent. +* +* CONTEXT: +* - Reads xwaveid (cross-wave identifier file) and merges interview date +* variables from each wave’s individual response file (`_indresp`). +* - Excludes proxy interviews (where `ivfio > 1`). +* - Converts interview month/year into a continuous “months since 2009” +* variable (`_mns09`). 
+* +* OUTPUTS: +* - intdate.dta : combined dataset with interview dates and months-since-2009 +************************************************************************************************* + +cap log close +log using "${dir_log_emphist}/01_Intdate.log", replace + +* The list of waves is defined *globally* in 00_Master.do +local waves $UKHLSwaves // copy global into a local for use here +local n: word count `waves' // number of waves + +******************************************************************** +* MERGE INTERVIEW DATES FROM EACH WAVE +******************************************************************** +use ${dir_ukhls_data}/xwaveid, clear + +forvalues i = 1/`n' { + local w : word `i' of `waves' + + * Merge interview date variables from each wave’s individual response file +merge 1:1 pidp using ${dir_ukhls_data}/`w'_indresp , /// + keepusing(`w'_intdatd_dv `w'_intdatm_dv `w'_intdaty_dv) + + + * Exclude proxy interviews (ivfio > 1 means proxy or non-response) + replace `w'_intdatd_dv = . if `w'_ivfio > 1 + replace `w'_intdatm_dv = . if `w'_ivfio > 1 + replace `w'_intdaty_dv = . 
if `w'_ivfio > 1 + + drop _merge + + * Compute months since 2009 for timeline consistency + gen `w'_mns09 = 12 * (`w'_intdaty_dv - 2009) + `w'_intdatm_dv /// + if `w'_intdaty_dv > 0 & `w'_intdatm_dv > 0 + + //tab `w'_mns09 +} + + +save intdate, replace + + +clear +cap log close diff --git a/input/InitialPopulations/compile/do_emphist/02_Lwintdat.do b/input/InitialPopulations/compile/do_emphist/02_Lwintdat.do new file mode 100644 index 000000000..881fba914 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/02_Lwintdat.do @@ -0,0 +1,123 @@ +************************************************************************************************* +* PROJECT: UKMOD update – create employment history data from UKHLS +* FILE: 02_Lwintdate.do +* +* PURPOSE: +* Creates variables identifying the *previous interview date* for each wave +* (from c to n) based on cross-wave interview date information. +* +* CONTEXT: +* - Uses the cross-wave dataset created in 01_Intdate.do. +* - For each wave, finds the respondent’s most recent previous interview +* (if any) and records its date (year, month, day) and wave number. +* - The first two waves (a, b) have no valid "previous" interview, so +* processing starts from wave c. 
+* +* OUTPUTS: +* - Variables: _lwint, _lintdaty, _lintdatm, _lintdatd +* - Files: _lint.dta (for each wave c–n) +************************************************************************************************* + +cap log close +log using "${dir_log_emphist}/02_Lwintdate.log", replace + +use intdate, clear + + +************************************************************************************************* +* DEFINE WAVES AND ASSOCIATED VARIABLES +************************************************************************************************* +local waves $UKHLSwaves // copy global into a local for use here + + +* Build lists of corresponding variable names for each wave +local rvars +local yvars +local mvars +local dvars + +foreach w of local waves { + local rvars "`rvars' `w'_ivfio" // fieldwork outcome (1 = full interview) + local yvars "`yvars' `w'_intdaty_dv" // interview year + local mvars "`mvars' `w'_intdatm_dv" // interview month + local dvars "`dvars' `w'_intdatd_dv" // interview day +} + +local nwaves : word count `waves' + + +************************************************************************************************* +* CREATE VARIABLES FOR PREVIOUS INTERVIEW DATES +************************************************************************************************* +forvalues w = 3/`nwaves' { // start from wave c + local curwave : word `w' of `waves' // current wave (e.g., "c") + local prevmax = `w' - 1 // number of prior waves + + di as text "Processing wave `curwave' (previous up to wave `prevmax')" + + * Initialise variables for this wave + gen `curwave'_lwint = 0 // previous wave index number + gen `curwave'_lintdaty = -9 // previous interview year + gen `curwave'_lintdatm = -9 // previous interview month + gen `curwave'_lintdatd = -9 // previous interview day + + * Check all earlier waves to find last valid interview + forvalues i = 1/`prevmax' { + local rw : word `i' of `rvars' + local yw : word `i' of `yvars' + local mw : word `i' of `mvars' 
+ local dw : word `i' of `dvars' + + * Replace if respondent was interviewed in both current and earlier wave + replace `curwave'_lwint = `i' if `curwave'_ivfio==1 & `rw'==1 + replace `curwave'_lintdaty = `yw' if `curwave'_ivfio==1 & `rw'==1 + replace `curwave'_lintdatm = `mw' if `curwave'_ivfio==1 & `rw'==1 + replace `curwave'_lintdatd = `dw' if `curwave'_ivfio==1 & `rw'==1 + } +} + + +************************************************************************************************* +* SAVE INTERMEDIATE DATASET WITH ALL WAVES +************************************************************************************************* +save intdate1, replace +drop if memorig==8 // exclude temporary or non-original household members + + +************************************************************************************************* +* EXPORT WAVE-SPECIFIC FILES +************************************************************************************************* +foreach w of local waves { + if inlist("`w'", "a", "b") continue // skip first two waves (no prior interviews) + + di as text "Saving previous interview data for wave `w'..." + + keep if `w'_ivfio==1 // respondents with valid interview + keep pidp `w'_lwint `w'_lintdaty `w'_lintdatm `w'_lintdatd + + save `w'_lint, replace // e.g., "c_lint.dta", "d_lint.dta", etc. 
+ + use intdate1, clear // reload full dataset for next wave + drop if memorig==8 +} + + + +clear +cap log close + + + + + + + + + + + + + + + + diff --git a/input/InitialPopulations/compile/do_emphist/03_Bhps_lintdate.do b/input/InitialPopulations/compile/do_emphist/03_Bhps_lintdate.do new file mode 100644 index 000000000..234b59b8e --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/03_Bhps_lintdate.do @@ -0,0 +1,98 @@ +/************************************************************************************************* +* PROJECT: UKMOD update – create employment history data from UKHLS & BHPS +* FILE: b_lint.do +* +* PURPOSE: +* Bridges the BHPS and UKHLS panels by identifying the *most recent BHPS interview* +* for each respondent before their first UKHLS interview (wave B). +* +* CONTEXT: +* - The BHPS (1991–2008) sample was incorporated into UKHLS starting in wave B (2009–10). +* - This script links the BHPS interview history to the first UKHLS observation +* so that employment and household histories remain continuous across the two panels. +* - It uses BHPS individual response data (waves L–R) and the combined UKHLS intdate file. 
+* +* OUTPUTS: +* - bhps_lint.dta : most recent BHPS interview date before UKHLS +* - b_lint.dta : previous interview info for wave B (merged BHPS or wave A) +*************************************************************************************************/ + +cap log close +log using "${dir_log_emphist}/03_Bhps_lintdate.log", replace + +/************************************************************************************************* +* BUILD BHPS LAST INTERVIEW FILE (1991–2008) +*************************************************************************************************/ + +use ${dir_bhps_data}/xwaveid_bh, clear + +gen lwint = 0 +gen lintdatd = 0 +gen lintdatm = 0 +gen lintdaty = 0 + +* Define BHPS waves included (update global once in master file) +local waves $BHPS_waves +local nwaves : word count `waves' + +forvalues i = 1/`nwaves' { + local w : word `i' of `waves' + + * Merge BHPS individual response data for this wave + merge 1:1 pidp using ${dir_bhps_data}/b`w'_indresp, /// + keepusing(b`w'_istrtdatd b`w'_istrtdatm b`w'_istrtdaty b`w'_ivfio) + + * Keep valid (non-proxy) interviews + replace lintdatd = b`w'_istrtdatd if b`w'_ivfio == 1 + replace lintdatm = b`w'_istrtdatm if b`w'_ivfio == 1 + replace lintdaty = b`w'_istrtdaty if b`w'_ivfio == 1 + replace lwint = `i' if b`w'_ivfio == 1 + + drop _merge +} + +keep if lwint > 0 +keep pidp lwint lintdatd lintdatm lintdaty +save bhps_lint, replace + + +/************************************************************************************************* +* LINK BHPS TO UKHLS WAVE B +*************************************************************************************************/ + +use intdate1, clear + +* Merge with BHPS last interview info +merge 1:1 pidp using bhps_lint +drop if _merge == 2 // BHPS-only cases (not in UKHLS) + +* Keep only those with full interviews in wave B +keep if b_ivfio == 1 + +tab memorig + +* Initialise +gen b_lwint = 0 +gen b_lintdaty = -9 +gen b_lintdatm = -9 +gen b_lintdatd = -9 + +* 
Link to UKHLS wave A (if available) +replace b_lwint = 1 if a_ivfio == 1 +replace b_lintdatd = a_intdatd_dv if a_ivfio == 1 +replace b_lintdatm = a_intdatm_dv if a_ivfio == 1 +replace b_lintdaty = a_intdaty_dv if a_ivfio == 1 + +* Replace with BHPS last interview info where available (merge==3) +replace b_lwint = lwint + 11 if _merge == 3 +replace b_lintdatd = lintdatd if _merge == 3 +replace b_lintdatm = lintdatm if _merge == 3 +replace b_lintdaty = lintdaty if _merge == 3 + +tab b_lwint + +keep pidp b_lwint b_lintdaty b_lintdatm b_lintdatd +save b_lint, replace + +clear +cap log close diff --git a/input/InitialPopulations/compile/do_emphist/04_Sp0_1_2a.do b/input/InitialPopulations/compile/do_emphist/04_Sp0_1_2a.do new file mode 100644 index 000000000..2df340a19 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/04_Sp0_1_2a.do @@ -0,0 +1,223 @@ +/************************************************************************************************* +* PROJECT: UKMOD update – create employment history data from UKHLS +* FILE: 04_Sp0_1_2a.do +* +* PURPOSE: +* Constructs employment history “spells” for each UKHLS wave (b–n). 
+* For each wave, it: +* - Identifies employment/non-employment episodes and transitions +* - Determines start and end dates of each spell +* - Produces three datasets: sp0 (initial), sp1 (main), sp2 (reshaped) +* +* CONTEXT: +* - Uses previous interview information from ${wp}lint.dta +* - Requires individual respondent data from ${original_data}/${wp}indresp.dta +* +* OUTPUTS: +* - ${wp}sp0.dta : initial spell definitions +* - ${wp}sp1.dta : continuation spells +* - ${wp}sp2.dta : reshaped multi-episode structure +*************************************************************************************************/ + +cap log close +log using "${dir_log_emphist}/04_Sp0_1_2a.log", replace + + +local wps ${UKHLS_waves_prefixed} +local wvno ${UKHLS_panel_waves_numbers} + +local n : word count `wps' // number of waves to process + + +/************************************************************************************************* +* LOOP THROUGH EACH WAVE +*************************************************************************************************/ + +forvalues i = 1/`n' { + + global wp : word `i' of `wps' // wave prefix (e.g. b_, c_, etc.) + global wv : word `i' of `wvno' // wave numeric label + + di as text "------------------------------------------------------" + di as text "Processing wave ${wp} (numeric ${wv})..." 
+ di as text "------------------------------------------------------" + + + /************************************************************************************************* + * PREPARE INDRESP DATA AND MERGE WITH PREVIOUS INTERVIEW FILE + *************************************************************************************************/ + + use ${dir_ukhls_data}/${wp}indresp.dta, clear + keep if ${wp}ivfio == 1 // keep full interviews only + drop if ${wp}hhorig == 8 // drop non-original HH members + + merge 1:1 pidp using ${wp}lint + drop _merge + + rename ${wp}* * // remove wave prefix + + keep pidp jbsemp jbstat notempchk - nxtst nxtstelse - cjbatt /// + ff_ivlolw ff_emplw ff_jbsemp ff_jbstat intdatd_dv intdatm_dv intdaty_dv /// + lwint lintdaty lintdatm lintdatd + + + /************************************************************************************************* + * DEFINE EMPLOYMENT FLAGS AND END DATE VARIABLES + *************************************************************************************************/ + + gen aehhas = 1 + replace aehhas = 0 if empchk == -8 & notempchk == -8 + keep if aehhas == 1 + + gen enddatestat = 0 + replace enddatestat = 1 if empchk == 1 + replace enddatestat = 2 if notempchk == 1 & empchk != 1 + replace enddatestat = 3 if empchk == 2 + replace enddatestat = 4 if notempchk == 2 & empchk == -8 + replace enddatestat = 1 if enddatestat == 0 & empchk != -8 + replace enddatestat = 2 if enddatestat == 0 & notempchk != -8 + replace enddatestat = 5 if enddatestat == 1 & (jbsamr == 2 | samejob == 2) + + gen endday = intdatd_dv if enddatestat < 3 + gen endmonth = intdatm_dv if enddatestat < 3 + gen endyear = intdaty_dv if enddatestat < 3 + + replace endday = jbendd if enddatestat == 5 + replace endmonth = jbendm if enddatestat == 5 + replace endyear = jbendy4 if enddatestat == 5 + + replace endday = empstendd if inlist(enddatestat, 3, 4) + replace endmonth = empstendm if inlist(enddatestat, 3, 4) + replace endyear = empstendy4 if 
inlist(enddatestat, 3, 4) + + save ${wp}sp1a, replace // store intermediate version + + + /************************************************************************************************* + * CREATE SPELL 0 DATASET (INITIAL EPISODE) + *************************************************************************************************/ + + gen startday = lintdatd + gen startmonth = lintdatm + gen startyear = lintdaty + gen stdatestat = 1 + + gen espstat = jbstat + replace espstat = 1 if jbsemp == 2 + replace espstat = 2 if jbsemp == 1 + replace espstat = ff_jbstat if enddatestat == 4 + replace espstat = 1 if enddatestat == 3 & ff_jbsemp == 2 + replace espstat = 2 if enddatestat == 3 & ff_jbsemp == 1 + replace espstat = 2 if enddatestat == 5 & espstat > 2 + + gen wave = ${wv} + gen spell = 0 + + keep pidp wave spell lwint - espstat lintdatd lintdatm lintdaty intdatm_dv intdaty_dv + save ${wp}sp0, replace + + + /************************************************************************************************* + * CREATE SPELL 1 DATASET (CONTINUATION EPISODES) + *************************************************************************************************/ + + use ${wp}sp1a, clear + keep if enddatestat > 2 + + rename endday startday + rename endmonth startmonth + rename endyear startyear + gen stdatestat = 2 + + rename enddatestat edstat1 + + * Determine new end dates + gen enddatestat = 0 + replace enddatestat = 1 if cjob == 1 + replace enddatestat = 3 if cjob == 2 + replace enddatestat = 2 if cstat == 2 & enddatestat == 0 + replace enddatestat = 4 if cstat == 1 & enddatestat == 0 + replace enddatestat = 1 if enddatestat == 0 & jbsemp != -8 + replace enddatestat = 2 if enddatestat == 0 & jbsemp == -8 + + gen endday = intdatd_dv if enddatestat < 3 + gen endmonth = intdatm_dv if enddatestat < 3 + gen endyear = intdaty_dv if enddatestat < 3 + + replace endday = nxtjbendd if enddatestat == 3 + replace endmonth = nxtjbendm if enddatestat == 3 + replace endyear = nxtjbendy4 
if enddatestat == 3 + + replace endday = nxtstendd if enddatestat == 4 + replace endmonth = nxtstendm if enddatestat == 4 + replace endyear = nxtstendy4 if enddatestat == 4 + + gen espstat = jbstat if enddatestat == 2 + replace espstat = 1 if jbsemp == 2 & enddatestat == 1 + replace espstat = 2 if jbsemp == 1 & enddatestat == 1 + replace espstat = nxtstelse + 2 if enddatestat == 4 & nxtstelse > 0 + replace espstat = nxtstelse if enddatestat == 4 & nxtstelse > -8 & nxtstelse < 0 + replace espstat = 1 if enddatestat == 3 & nxtjbes == 2 + replace espstat = 2 if enddatestat == 3 & nxtjbes > -8 & nxtjbes < 2 + replace espstat = 2 if enddatestat == 1 & missing(espstat) + + gen wave = ${wv} + gen spell = 1 + + keep pidp wave spell lwint startday - espstat lintdatd lintdatm lintdaty intdatm_dv intdaty_dv + save ${wp}sp1, replace + + + /************************************************************************************************* + * CREATE SPELL 2 DATASET (RESHAPED MULTI-EPISODE STRUCTURE) + *************************************************************************************************/ + + use ${dir_ukhls_data}/${wp}indresp.dta, clear + rename ${wp}* * + keep if ivfio == 1 + drop if hhorig == 8 + + keep pidp nextstat* nextelse* currstat* nextjob* currjob* jobhours* statendd* statendm* statendy4* + + reshape long nextstat nextelse currstat nextjob currjob jobhours statendd statendm statendy4, i(pidp) j(sp2) + drop if nextstat == -8 + + quietly merge m:1 pidp using ${dir_ukhls_data}/${wp}indresp, /// + keepusing(${wp}intdatd_dv ${wp}intdatm_dv ${wp}intdaty_dv) + keep if _merge == 3 + drop _merge + + merge m:1 pidp using ${wp}lint + keep if _merge == 3 + drop _merge + rename ${wp}* * + + gen enddatestat = 0 + replace enddatestat = 1 if currjob == 1 + replace enddatestat = 3 if currjob == 2 + replace enddatestat = 4 if currstat == 1 + replace enddatestat = 2 if currstat > -8 & enddatestat == 0 + + gen endday = intdatd_dv + gen endmonth = intdatm_dv + gen endyear = 
intdaty_dv + + replace endday = statendd if enddatestat > 2 + replace endmonth = statendm if enddatestat > 2 + replace endyear = statendy4 if enddatestat > 2 + + gen espstat = nextstat + replace espstat = nextelse + 2 if nextelse > 0 + replace espstat = nextelse if nextstat == 2 & nextelse < 0 + replace espstat = 1 if nextjob == 1 + replace espstat = 2 if nextjob > 1 + replace espstat = 2 if nextjob > -8 & nextjob < 0 + + gen spell = sp2 + 1 + gen wave = ${wv} + + keep pidp spell wave endday endmonth endyear enddatestat espstat lintdatd lintdatm lintdaty intdatm_dv intdaty_dv + save ${wp}sp2, replace +} + +cap log close diff --git a/input/InitialPopulations/compile/do_emphist/05_Newentrant1.do b/input/InitialPopulations/compile/do_emphist/05_Newentrant1.do new file mode 100644 index 000000000..f8efe7ca9 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/05_Newentrant1.do @@ -0,0 +1,191 @@ +/************************************************************************************************* +* PROJECT: UKMOD update – create employment history data from UKHLS +* FILE: 05_Newentrant1.do +* +* PURPOSE: +* Constructs “new entrant” employment spells for all available waves (A–latest). +* For each wave, identifies individuals who recently entered employment, +* infers start and end dates, and creates wave-specific spell files. +* +* NOTES: +* Patryk’s comment said that b_jbbgdat{y,m,d} were missing from earlier release data +* and obtained from Graham. It looks like they were added to the new release, +* therefore special treatment of wave B is no longer needed. 
+*************************************************************************************************/ + +cap log close +log using "${dir_log_emphist}/05_Newentrant1.log", replace + +/****************************************************************************** + * WAVE A: process separately (Nick Buck original logic) + ******************************************************************************/ +di as text "------------------------------------------------------" +di as text "Processing new entrant spells for wave a (numeric 1)" +di as text "------------------------------------------------------" + +capture use ${dir_ukhls_data}/a_indresp.dta, clear + + tab a_jbbgm a_jbsemp + tab a_jbbgy if a_jbbgm > 0 & a_jbsemp > 0 + tab a_jbbgy if a_jbbgm < 0 & a_jbsemp > 0 + tab a_jbhad + drop if a_ivfio==2 + rename a_* * + gen spell=0 + gen wave=1 + gen espstat=jbstat + replace espstat=1 if jbsemp==2 + replace espstat=2 if jbsemp==1 + tab espstat jbsemp + gen endyear=intdaty_dv + gen endmonth=intdatm_dv + gen endday=intdatd_dv + gen startmonth=-9 + gen startyear=-9 + gen startday=-9 + replace startyear=jbbgy if jbsemp > 0 + replace startmonth=jbbgm if jbsemp > 0 + replace startday=jbbgd if jbsemp > 0 + tab jlendm jbhad + replace startyear=jlendy if jbhad==1 + replace startmonth=jlendm if jbhad==1 + replace startday=1 if jbhad==1 + tab jbhad jbsemp + tab jbstat if jbhad==2 + replace startyear=2007 if jbhad==2 + replace startmonth=1 if jbhad==2 + replace startday=1 if jbhad==2 + tab startyear + tab jbstat if startyear < 0 + replace startyear=2007 if startyear < 0 + tab startyear if startmonth < 0 + replace startmonth=1 if startmonth < 0 + tab espstat + tab jbstat + gen ne=1 + keep pidp spell wave espstat endyear endmonth endday startmonth startyear startday intdaty_dv intdatm_dv ne + save a_sp0_ne, replace + + di as text "Saved a_sp0_ne.dta successfully." + + +/****************************************************************************** + * LOOP THROUGH WAVES (b ... 
n) using master globals + ******************************************************************************/ +local wps ${UKHLS_waves_prefixed} +local wvno ${UKHLS_panel_waves_numbers} + +local n : word count `wps' + +forvalues i = 1/`n' { + local wp : word `i' of `wps' // prefix e.g. b_ + local wn : word `i' of `wvno' // numeric e.g. 2 + + di as text "------------------------------------------------------" + di as text "Processing new entrant spells for wave `wp' (numeric `wn')" + di as text "------------------------------------------------------" + + use ${dir_ukhls_data}/`wp'indresp.dta, clear + + * harmonise variable names (=remove wave prefix) + rename `wp'* * + + /************************************************************************************************* + * CHECK THAT REQUIRED VARIABLES EXIST + * (if some essential vars missing, warn and skip) + *************************************************************************************************/ + local reqvars "pidp ivfio hhorig notempchk empchk jbsemp jbstat intdaty_dv intdatm_dv intdatd_dv jbhad jlendy jlendm jbbgy jbbgm jbbgd" + local missing_vars + + foreach v of local reqvars { + capture confirm variable `v' + if _rc local missing_vars "`missing_vars' `v'" + } + + if "`missing_vars'" != "" { + di as error "WARNING: Missing variables in wave `wp': `missing_vars'" + di as text "Skipping this wave..." 
+ continue + } + + /************************************************************************************************* + * BASIC SUMMARY FOR DIAGNOSTICS + *************************************************************************************************/ + di as text "Variable overview for wave `wp':" + summarize jbsemp jbstat jbhad jbbgy jbbgm jbbgd jlendy jlendm + + /************************************************************************************************* + * FILTER AND PROCESS + *************************************************************************************************/ + drop if ivfio == 2 // exclude proxy interviews + capture confirm variable hhorig + if !_rc { + drop if hhorig == 8 // exclude temporary members (if variable exists) + } + + gen aehhas = 1 + replace aehhas = 0 if notempchk == -8 & empchk == -8 + keep if aehhas == 0 + + * Fill jbbg values if missing + capture confirm variable jbbgdaty + if !_rc { + replace jbbgy=jbbgdaty if jbbgy < 0 & jbbgdaty > 0 & jbbgdaty != . + replace jbbgm=jbbgdatm if jbbgm < 0 & jbbgdatm > 0 & jbbgdatm != . + replace jbbgd=jbbgdatd if jbbgd < 0 & jbbgdatd > 0 & jbbgdatd != . 
+ } + + gen spell = 0 + gen wave = `wn' + + * Define employment status at spell end + gen espstat = jbstat + replace espstat = 1 if jbsemp == 2 + replace espstat = 2 if jbsemp == 1 + + gen endyear = intdaty_dv + gen endmonth = intdatm_dv + gen endday = intdatd_dv + + * Default missing start dates + gen startyear = -9 + gen startmonth = -9 + gen startday = -9 + + * Fill start date from job start info (if employed) + replace startyear = jbbgy if jbsemp > 0 + replace startmonth = jbbgm if jbsemp > 0 + replace startday = jbbgd if jbsemp > 0 + + * For those who had a job previously (jbhad == 1) + replace startyear = jlendy if jbhad == 1 + replace startmonth = jlendm if jbhad == 1 + replace startday = 1 if jbhad == 1 + + * If no job since 2007, assign default early date + replace startyear = 2007 if jbhad == 2 | startyear < 0 + replace startmonth = 1 if jbhad == 2 | startmonth < 0 + replace startday = 1 if jbhad == 2 | startday < 0 + + * Flag for new entrant + gen ne = 1 + + /************************************************************************************************* + * SAVE WAVE-SPECIFIC SPELL FILE + *************************************************************************************************/ + keep pidp spell wave espstat endyear endmonth endday /// + startmonth startyear startday intdaty_dv intdatm_dv ne + + save `wp'sp0_ne, replace + + di as text "Saved `wp'sp0_ne.dta successfully." +} + +/****************************************************************************** + * END + ******************************************************************************/ +di as text "------------------------------------------------------" +di as text "All available waves processed. Check logs for warnings." 
+di as text "------------------------------------------------------" + +cap log close diff --git a/input/InitialPopulations/compile/do_emphist/06_Aspells1.do b/input/InitialPopulations/compile/do_emphist/06_Aspells1.do new file mode 100644 index 000000000..f0bcf2913 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/06_Aspells1.do @@ -0,0 +1,160 @@ +/************************************************************************************************* +* PROJECT: UKMOD update – create employment history data from UKHLS +* FILE: 06_Aspells1.do +* +* PURPOSE: +* Combines all wave-specific employment spell files (sp0, sp0_ne, sp1, sp2) +* into a single dataset covering all available waves. +* Derives consistent start and end dates, imputes missing dates, +* and removes invalid or inconsistent spells. +* +* NOTES: +* - BHPS-origin members receive approximate start dates if missing. +*************************************************************************************************/ + +cap log close +log using "${dir_log_emphist}/06_Aspells1.log", replace + +/************************************************************************************************* + * INITIALISE AND APPEND SPELL FILES + *************************************************************************************************/ + +di as text "------------------------------------------------------" +di as text "Combining wave-specific spell files into one dataset" +di as text "------------------------------------------------------" + +* Start with wave a +use a_sp0_ne, clear + +* Loop through later waves using global list from master file +local wps ${UKHLS_waves_prefixed} +local n : word count `wps' + +forvalues i = 1/`n' { + local wp : word `i' of `wps' + di as text "Appending spell files for wave `wp'..." 
+ + capture append using `wp'sp0 + capture append using `wp'sp0_ne + capture append using `wp'sp1 + capture append using `wp'sp2 +} + +di as text "All wave-specific spell files appended successfully." + +/************************************************************************************************* + * MERGE WITH CROSS-WAVE IDENTIFIER + *************************************************************************************************/ + +di as text "Merging with xwaveid file to obtain memorig variable..." +merge m:1 pidp using ${dir_ukhls_data}/xwaveid, keepusing(memorig) +keep if _merge == 3 +drop _merge + +/************************************************************************************************* + * IMPUTE MISSING BHPS INTERVIEW DATES + *************************************************************************************************/ + +di as text "Applying BHPS date fix for legacy members..." +gen bhps = 0 +replace bhps = 1 if memorig > 2 & memorig < 7 + +replace startyear = 2008 if bhps == 1 & startyear == -9 +replace startmonth = 9 if bhps == 1 & startmonth == -9 +replace startday = 1 if bhps == 1 & startday == -9 + +/************************************************************************************************* + * ADJUST START AND END DATES USING INTERVIEW TIMING + *************************************************************************************************/ + +di as text "Adjusting spell dates relative to previous and current interviews..." 
+ +gen valdat1 = 0 +replace valdat1 = 1 if lintdaty > 0 & lintdatm > 0 & endyear > 0 & endmonth > 0 +gen durat1 = 12 * (endyear - lintdaty) + (endmonth - lintdatm) if valdat1 == 1 + +gen valdat2 = 0 +replace valdat2 = 1 if lintdaty > 0 & lintdatm > 0 & startyear > 0 & startmonth > 0 +gen durat2 = 12 * (startyear - lintdaty) + (startmonth - lintdatm) if valdat2 == 1 + +replace endyear = lintdaty if durat1 < 0 +replace endmonth = lintdatm if durat1 < 0 +replace endday = lintdatd if durat1 < 0 +replace startyear = lintdaty if durat2 < 0 +replace startmonth = lintdatm if durat2 < 0 +replace startday = lintdatd if durat2 < 0 + +/************************************************************************************************* + * FILL START DATES FROM PREVIOUS SPELLS + *************************************************************************************************/ + +sort pidp wave spell +replace startyear = endyear[_n-1] if spell > 1 +replace startmonth = endmonth[_n-1] if spell > 1 +replace startday = endday[_n-1] if spell > 1 + +/************************************************************************************************* + * COMPUTE MIDPOINT DATES (FOR MISSING VALUES) + *************************************************************************************************/ + +gen lint00 = 12 * (lintdaty - 2000) + lintdatm if lintdaty > 0 & lintdatm > 0 +gen int00 = 12 * (intdaty_dv - 2000) + intdatm_dv if intdaty_dv > 0 & intdatm_dv > 0 +gen interval = int00 - lint00 +gen mint00 = lint00 + round(interval / 2) +gen midyear = 2000 + int(mint00 / 12) +gen midmonth = mint00 - 12 * int(mint00 / 12) +replace midyear = midyear - 1 if midmonth == 0 +replace midmonth = 12 if midmonth == 0 + +/************************************************************************************************* + * MANUAL IMPUTATIONS FOR PARTIAL MISSING MONTHS + *************************************************************************************************/ + +replace endmonth = 1 if endyear == 
intdaty_dv & endmonth < 0 +replace endmonth = 12 if endyear == lintdaty & endmonth < 0 +replace endmonth = 6 if endyear > lintdaty & endyear < intdaty_dv & endmonth < 0 + +replace startmonth = endmonth[_n-1] if spell == 1 & startmonth < 0 & startyear > 0 & endmonth[_n-1] > 0 + +gen valstart = (startmonth > 0 & startyear > 0) +gen valend = (endmonth > 0 & endyear > 0) + +replace startmonth = midmonth if valstart == 0 & midmonth != . +replace startyear = midyear if valstart == 0 & midyear != . +replace endmonth = midmonth if valend == 0 & midmonth != . +replace endyear = midyear if valend == 0 & midyear != . + +/************************************************************************************************* + * COMPUTE SPELL DURATION AND VALIDATION + *************************************************************************************************/ + +gen valdat = (startyear > 0 & startmonth > 0 & endyear > 0 & endmonth > 0) +gen durat = 12 * (endyear - startyear) + (endmonth - startmonth) if valdat == 1 + +save allspells1, replace + +/************************************************************************************************* + * FILTER AND CLEAN SPELLS + *************************************************************************************************/ + +use allspells1, clear + +gen d2 = (valdat == 0) +bys pidp: egen nd2 = sum(d2) +tab nd2 + +keep if nd2 == 0 +keep if durat >= 0 +drop if durat == . 
+drop if espstat < 0 + +save allspells1ok, replace + +/************************************************************************************************* + * END + *************************************************************************************************/ +di as text "------------------------------------------------------" +di as text "All spells processed and saved as allspells1ok.dta" +di as text "------------------------------------------------------" + +cap log close diff --git a/input/InitialPopulations/compile/do_emphist/07_Empcal1a.do b/input/InitialPopulations/compile/do_emphist/07_Empcal1a.do new file mode 100644 index 000000000..8dc71ee78 --- /dev/null +++ b/input/InitialPopulations/compile/do_emphist/07_Empcal1a.do @@ -0,0 +1,324 @@ +/************************************************************************************************* +* PROJECT: UKMOD update – create employment calendar and per-wave employment history +* FILE: 07_Empcal1a.do +* +* PURPOSE: +* - Build a monthly employment calendar (2007 onward) from all employment spells. +* - Derive per-wave employment history variables needed for UKMOD. +* - Output one per-wave file (b_emphist, ..., n_emphist) with summary measures. +* +* INPUTS: +* allspells1ok.dta - individual-level employment spells constructed in 06_Aspells1.do +* ${original_data}\_indresp.dta - wave-specific interview response data +* +* OUTPUTS: +* ${data}\_emphist.dta - per-wave employment history summary files +* ${data}\temp_liwwh.dta - long file with all waves appended +*************************************************************************************************/ +local baseyr 2007 //==> All subsequent month indexing is relative to January 2007. 
+ +use allspells1ok, clear // Load the prepared spell data + +*-------------------------------------------------------------* +* Convert start and end dates into months since base year (Jan 2007) +*-------------------------------------------------------------* + +gen stmy07 = 12 * (startyear - `baseyr') + startmonth +gen enmy07 = 12 * (endyear - `baseyr') + endmonth + +*-------------------------------------------------------------* +* Simplified employment status: 2 = employed, 1 = not employed +*-------------------------------------------------------------* +fre espstat +gen emp=1 if espstat > 2 +replace emp=2 if espstat < 3 +tab espstat emp + +*-------------------------------------------------------------* +* Determine full observed month range +*-------------------------------------------------------------* +summ enmy07, meanonly +local maxm = r(max) +local minm = 1 // start from month 1 to avoid negatives + +di as txt "Detected month range: " as res "`minm'–`maxm' (" as res `=`maxm'-`minm'+1' " months total)" + +*-------------------------------------------------------------* +* Generate monthly employment indicators (esp# = status each month) +*-------------------------------------------------------------* +forvalues i = `minm'/`maxm' { + gen esp`i' = 0 + replace esp`i' = emp if `i' >= stmy07 & `i' <= enmy07 +} + +*-------------------------------------------------------------* +* Collapse multiple spells per person (2 overrides 1) +*-------------------------------------------------------------* +forvalues i = `minm'/`maxm' { + bys pidp: egen memp`i' = max(esp`i') +} + +*-------------------------------------------------------------* +* Keep one row per person and retain key variables +*-------------------------------------------------------------* +bys pidp: gen seq = _n +keep if seq == 1 +keep pidp memorig memp`minm'-memp`maxm' + +/*-------------------------------------------------------------* +* Count employed months per financial year (April–March) ==> not 
sure if this is needed so commented out for now +*-------------------------------------------------------------* +summ memp*, meanonly + +local fy_start = `baseyr' +local fy_end = floor(`baseyr' + (`maxm' + 8) / 12) // +8 ensures FY covers Apr–Mar + +forvalues y = `fy_start'/`fy_end' { + local fy = substr("`y'",3,2) // e.g. 2007 → "07" + local start = (12 * (`y' - `baseyr')) + 4 // April of FY + local end = (12 * (`y' - `baseyr' + 1)) + 3 // March next year + + * Clip to observed range + if `start' < `minm' local start = `minm' + if `end' > `maxm' local end = `maxm' + + * Count months employed (status = 2) + gen efy`fy' = 0 + forvalues i = `start'/`end' { + replace efy`fy' = efy`fy' + 1 if memp`i' == 2 + } + + di as txt "FY" `y' "/" `= `y'+1' " → months " as res "`start'–`end'" +} +*/ + +save empcal1a, replace +/*we end up with a monthly calendar of activity (i.e. employed or not) for each individual from Jan 2007*/ + +/************************************************************************************************* + Derive wave-specific employment history summaries + -------------------------------------------------------------------- + For each wave, merge with interview date, calculate employment duration + up to that interview month (liwwh), and short-term employment indicators: + empmonth - months employed in 6 months before interview + mismonth - months missing in last 6 months + empmonth12 - months employed in 12 months before interview +*************************************************************************************************/ + +local waves $UKHLS_panel_waves +//local waves b + +foreach w of local waves { + + di "---------------------------------------------------------" + di "Processing WAVE `w' ..." 
+ di "---------------------------------------------------------" + + use empcal1a, clear + + merge 1:1 pidp using ${dir_ukhls_data}/`w'_indresp, /// + keepusing(`w'_intdatm_dv `w'_intdaty_dv `w'_ivfio) + keep if _merge == 3 + drop _merge + drop if `w'_ivfio == 2 // exclude proxy interviews + + * Interview month index relative to base year (Jan 2007) + gen inmy07 = 12*(`w'_intdaty_dv - `baseyr') + `w'_intdatm_dv + + + *------------------------------------------ + * Total months employed up to interview + *------------------------------------------ + gen liwwh = 0 + summarize inmy07, meanonly + local maxm = r(max) + forvalues i = 1/`maxm' { + replace liwwh = liwwh + 1 if memp`i' == 2 & `i' <= inmy07 + } + + *------------------------------------------ + * Short-term employment summaries + *------------------------------------------ + summarize inmy07, meanonly + local maxm = r(max) + local start6m = `maxm' - 6 + local start12m = `maxm' - 12 + + gen empmonth = 0 + gen mismonth = 0 + gen empmonth12 = 0 + + forvalues i = `start6m'/`maxm' { + replace empmonth = empmonth + 1 if memp`i' == 2 & inmy07 == `maxm' + replace mismonth = mismonth + 1 if memp`i' == 0 & inmy07 == `maxm' + } + forvalues i = `start12m'/`maxm' { + replace empmonth12 = empmonth12 + 1 if memp`i' == 2 & inmy07 == `maxm' + } + + *------------------------------------------ + * Keep and label key variables + *------------------------------------------ + keep pidp `w'_intdatm_dv `w'_intdaty_dv liwwh empmonth mismonth empmonth12 //efy1 efy2 + label var liwwh "Total months in employment up to current interview" + label var empmonth "Months employed in last 6 months before interview" + label var mismonth "Months missing in last 6 months before interview" + label var empmonth12 "Months employed in last 12 months before interview" + + save `w'_emphist, replace +} + +di as txt "All waves (B–N) processed successfully." 
+ +*------------------------------------------ +* Combine per-wave employment history files +*------------------------------------------ + +* Convert global into a local list and remove first letter (because we start from wave c) +local waves $UKHLS_panel_waves +local first : word 1 of `waves' +local waves : list waves - first + +display "Waves to append: `waves'" + +use b_emphist, clear +gen wave = "b" + +foreach w of local waves { + display "Appending wave `w'..." + append using `w'_emphist, generate(flag_`w') + replace wave = "`w'" if flag_`w' == 1 + drop flag_`w' +} + +* generate wave identifier +gen swv = . + +local letters $UKHLS_panel_waves +local numbers $UKHLS_panel_waves_numbers + +local n : word count `letters' +forval i = 1/`n' { + local wv : word `i' of `letters' + local num : word `i' of `numbers' + replace swv = `num' if wave == "`wv'" +} + +gen idperson=pidp + +save temp_liwwh.dta, replace + +duplicates report swv idperson +bys swv: sum liwwh + +cap log close + +/************************************************************************************** +* clean-up and exit +*************************************************************************************/ + +#delimit ; +local files_to_drop +allspells1.dta +a_sp0_ne.dta +bhps_lint.dta +b_emphist.dta +b_lint.dta +b_sp0.dta +b_sp0_ne.dta +b_sp1.dta +b_sp1a.dta +b_sp2.dta +c_emphist.dta +c_lint.dta +c_sp0.dta +c_sp0_ne.dta +c_sp1.dta +c_sp1a.dta +c_sp2.dta +d_emphist.dta +d_lint.dta +d_sp0.dta +d_sp0_ne.dta +d_sp1.dta +d_sp1a.dta +d_sp2.dta +e_emphist.dta +e_lint.dta +e_sp0.dta +e_sp0_ne.dta +e_sp1.dta +e_sp1a.dta +e_sp2.dta +f_emphist.dta +f_lint.dta +f_sp0.dta +f_sp0_ne.dta +f_sp1.dta +f_sp1a.dta +f_sp2.dta +g_emphist.dta +g_lint.dta +g_sp0.dta +g_sp0_ne.dta +g_sp1.dta +g_sp1a.dta +g_sp2.dta +h_emphist.dta +h_lint.dta +h_sp0.dta +h_sp0_ne.dta +h_sp1.dta +h_sp1a.dta +h_sp2.dta +intdate.dta +intdate1.dta +i_emphist.dta +i_lint.dta +i_sp0.dta +i_sp0_ne.dta +i_sp1.dta +i_sp1a.dta +i_sp2.dta +j_emphist.dta 
+j_lint.dta +j_sp0.dta +j_sp0_ne.dta +j_sp1.dta +j_sp1a.dta +j_sp2.dta +k_emphist.dta +k_lint.dta +k_sp0.dta +k_sp0_ne.dta +k_sp1.dta +k_sp1a.dta +k_sp2.dta +l_emphist.dta +l_lint.dta +l_sp0.dta +l_sp0_ne.dta +l_sp1.dta +l_sp1a.dta +l_sp2.dta +m_emphist.dta +m_lint.dta +m_sp0.dta +m_sp0_ne.dta +m_sp1.dta +m_sp1a.dta +m_sp2.dta +n_emphist.dta +n_lint.dta +n_sp0.dta +n_sp0_ne.dta +n_sp1.dta +n_sp1a.dta +n_sp2.dta + ; +#delimit cr // cr stands for carriage return + +foreach file of local files_to_drop { + erase "$dir_data_emphist/`file'" +} + diff --git a/input/reg_fertility.xlsx b/input/reg_fertility.xlsx index 27644860a..d36df9aa3 100644 Binary files a/input/reg_fertility.xlsx and b/input/reg_fertility.xlsx differ diff --git a/input/reg_home_ownership.xlsx b/input/reg_home_ownership.xlsx index b272ac25a..25b15ec1b 100644 Binary files a/input/reg_home_ownership.xlsx and b/input/reg_home_ownership.xlsx differ diff --git a/input/reg_labourSupplyUtility.xlsx b/input/reg_labourSupplyUtility.xlsx index 8d8403ce9..b9ccb4585 100644 Binary files a/input/reg_labourSupplyUtility.xlsx and b/input/reg_labourSupplyUtility.xlsx differ diff --git a/input/reg_leaveParentalHome.xlsx b/input/reg_leaveParentalHome.xlsx index b41d9e2e9..39723eb58 100644 Binary files a/input/reg_leaveParentalHome.xlsx and b/input/reg_leaveParentalHome.xlsx differ