// *****************************************************************************
// PROGRAM - OMCA2
// Created by Jane Fry and Clare Boulton, Productivity Commission, Melbourne
// August, 2013.
// *****************************************************************************
// THIS PROGRAM PERFORMS DESCRIPTIVE ANALYSIS OF THE SEQUENCES OF ACTIVITIES IN THE
// PATHWAYS. THE DATA REQUIRED FOR THIS PROGRAM CAN BE GENERATED BY RUNNING THE OMCA1
// PROGRAM AND LINES 65-72 OF CODE IN THE MASTER PROGRAM. THE VARIABLES IN THE
// DATASET AFTER RUNNING THAT CODE INCLUDE THE PATHWAY MEMBERSHIP VARIABLE (groupWARD_5
// FOR YOUTH), OM1, THE ACTIVITY VARIABLE, ORDER, AHGSEX, AHGAGE AND W1AGE.
//
// FOR EACH (YOUTH) PATHWAY, SEQUENCE INDEX PLOTS ARE PRODUCED AND THE CONCENTRATION OF
// SEQUENCES IS ANALYSED. FOR THE (YOUTH) AGE SEGMENT AS A WHOLE AND FOR EACH (YOUTH)
// PATHWAY, EXCEL WORKSHEETS WITH DATA THAT CAN BE USED TO PRODUCE CHRONOGRAPHS ARE
// PRODUCED. THE PROGRAM ALSO CREATES STATA DATA FILES THAT CAN BE MERGED WITH OTHER
// (ANNUAL) HILDA DATA TO PERFORM DESCRIPTIVE ANALYSIS OF THE INDIVIDUALS IN THE PATHWAYS.
//
// This program creates 3 main output files (example: youths only):
//     "for_charts_youths.dta"
//     "youths.dta"
//     "youthpcent_merge.out"

// *****************************************************************************
// SEQUENCE INDEX PLOTS
// *****************************************************************************

// STEP 1: Recode the activity variable to minimise overplotting.
// The categories are re-ordered from the most to the least frequently
// occurring activity. The ordering used here is the one for youths.
//
// Initial categorisation for all age groups was as follows:
//     1 "Study only" 2 "Work & study" 3 "Work only" 4 "Unemployed" 5 "NILF"
//
// Optional check of how much time is spent in each activity (original codes):
//egen durs  = sqlength(), element(1) // study only
//egen durws = sqlength(), element(2) // work & study
//egen durw  = sqlength(), element(3) // work only
//egen duru  = sqlength(), element(4) // unemployed
//egen durn  = sqlength(), element(5) // NILF
// summarize durs durws durw duru durn

// New coding - from most frequent activity for youths to least:
//     1 "Work only" 2 "Work & study" 3 "Study only" 4 "NILF" 5 "Unemployed"
//summarize activity // before recode

// Codes 1<->3 and 4<->5 swap places; code 2 ("Work & study") keeps its value,
// so it needs no rule of its own.
recode activity (1=3) (3=1) (4=5) (5=4)

label define activity1 ///
    1 "Work only"      ///
    2 "Work & study"   ///
    3 "Study only"     ///
    4 "NILF"           ///
    5 "Unemployed"
label values activity activity1
//summarize activity // after recode to check

// STEP 2: Save the recoded data so that the sequence index plots can be
// produced later without having to do OMCA again.
save "for_charts_youths", replace

// STEP 3: Generate and save sequence index plots for each pathway.
// Note: the ordering of colors needs to correspond to the order of activities.
// So, in this example ltbluishgray corresponds to activity=1, which has been coded
// above as "work only". Likewise, khaki corresponds to activity=2 (work and study);
// lavender corresponds to activity=3 (study only); cranberry corresponds to
// activity=4 (NILF); and dknavy corresponds to activity=5 (unemployed).
//
// Change the number of pathways in the first line of code below as required.
// Change the pathway no. variable and titles (2nd line) and file names (3rd line) as required.
// Graph options shared by every pathway plot, collected in locals so that the
// sqindexplot call itself stays short. They are expanded verbatim below.
local si_legend legend(on rows(1) bmargin(t=0) symxsize(3) symysize(2) ///
    region(color(gs16) fcolor(gs16) lcolor(gs16)) position(6) bexpand span)
local si_titles xtitle("Year", margin(vsmall) yoffset(-2.0)) ///
    ytitle("Individuals", margin(small) width(150) )
local si_regions ///
    graphregion(fcolor(gs16) ifcolor(gs16) lcolor(gs16) ilcolor(gs16)) ///
    plotregion(fcolor(gs16) ifcolor(gs16) lcolor(gs16) ilcolor(gs16))
// One tick every 12 months, labelled with the calendar year.
local si_xlabel xlabel(0 "2000" 12 "2001" 24 "2002" 36 "2003" 48 "2004" 60 "2005" ///
    72 "2006" 84 "2007" 96 "2008" 108 "2009" 120 "2010", labsize(small))

// One sequence index plot per pathway (5 pathways), sequences ordered by om1.
forvalues path = 1/5 {
    sqindexplot if groupWARD_5 == `path',                             ///
        name("Youths_`path'", replace)                                ///
        saving(Youths_`path', replace)                                ///
        order(om1)                                                    ///
        `si_legend'                                                   ///
        `si_titles'                                                   ///
        color(ltbluishgray khaki lavender cranberry dknavy)           ///
        aspectratio(0.6)                                              ///
        `si_regions'                                                  ///
        ylabel(, labsize(small) labgap(*2))                           ///
        `si_xlabel'                                                   ///
        rbar
}
// Each plot is saved in a Stata graph file (.gph) and can be accessed later
// without the need for any prior commands.

// *****************************************************************************
// DESCRIPTIVE ANALYSIS BY PATHWAY
// *****************************************************************************
// Concentration of sequences.
// For further information about descriptive analysis of sequence data using
// the SQ suite of commands see:
//     Brzinsky-Fay, C., Kohler, U. and Luniak, M. 2006, 'Sequence analysis with Stata',
//     The Stata Journal, vol. 6, no. 4, pp. 435-60.

// All sequences together.
sqdes

// Concentration of sequences by pathway: description of the sequences plus a
// table of the 20 top-ranked sequences for each pathway.
local npaths = 5 // (5 pathways)
forvalues path = 1/`npaths' {
    sqdes if groupWARD_`npaths' == `path'
    sqtab if groupWARD_`npaths' == `path', ranks(1/20) so
}

// Percentage of time in each activity.
// Ensure element numbers correspond to the correct activity (the code below
// corresponds to activities for youth after the recode above).
// For every pathway c and every activity, create
//     c<c>dur<s> : number of periods (months) spent in the activity, and
//     c<c>pc<s>  : the same as a percentage of the 120 periods,
// where the suffix <s> identifies the activity. The position of a suffix in the
// list below is the element (activity code) it refers to:
//     w = 1 work only, ws = 2 work & study, s = 3 study only,
//     n = 4 NILF,      u  = 5 unemployed
local g = 5
local suffixes w ws s n u
forvalues c = 1/`g' {
    local element = 0
    foreach s of local suffixes {
        local ++element
        egen c`c'dur`s' = sqlength() if groupWARD_`g' == `c', element(`element')
        gen  c`c'pc`s'  = (c`c'dur`s'/120)*100
    }
}

// Calculate the average percentage of time in each activity for each pathway.
local g = 5
local suffixes w ws s n u
forvalues c = 1/`g' {
    foreach s of local suffixes {
        summarize c`c'pc`s'
    }
}

// For each pathway, get the gender mix and average age.
local g = 5
forvalues c = 1/`g' {
    tab ahgsex if groupWARD_`g' == `c'
    summarize ahgage if groupWARD_`g' == `c'
}

// *****************************************************************************
// CHRONOGRAPHS
// A chronograph shows the percentage of individuals in a given group
// (i.e. age segment or pathway) in each of the five activities (2-D areas stacked
// along the y axis; totalling 100 per cent), over 120 months (along the x axis).
// The code below generates the data required to produce a chronograph for an age segment
// (youth, for example) and then for each of the youth pathways.
// The data required for each group are 5 variables, with each of the 5 variables
// representing the percentage of individuals in 1 of the 5 activities over 120 months.
// In the first step to generating this data, numbers ('counts') of individuals in each
// activity are generated based on the activity variables for all individuals in the
// group. Based on these counts, percentages of individuals in each activity are calculated.
// Once the 5 variables are correctly set up, all but one individual (the 1st is used
// here) can be deleted from the dataset because all individuals should have the same
// data.
// But before observations can be dropped, care must be taken to ensure that
// the observation kept will include all of the required data.
// Care also needs to be taken to correctly distinguish between missing counts/percentages
// and zero counts/percentages.
// *****************************************************************************

// At this point the data includes the results of OMCA (incl. groupWARD_5), ahgsex, ahgage, w1age,
// the recoded activity variable (in long format so only 1 variable), order, as well as the variables
// derived above to describe percentage of time in each activity.

// The code below generates lots of missing values (and a note each time), so
// output paging is switched off.
// NOTE(review): "permanently" also changes the setting for future Stata sessions.
set more off, permanently

// The chronograph requires data about the percentages of individuals in the activities rather than
// percentages of time spent in the activities, so the data must be in wide format rather than long.
reshape wide activity, i(xwaveid) j(order)

// Drop the egen variables (percentages across time for each i).
drop c*

// *****************************************************************************
// ASIDE: to save data in wide format for merging with annual HILDA data later:
keep xwaveid groupWARD_5 activity* w1age // need xwaveid for merging

// Rename the pathway number variable consistently across age segments
// (execute from "preserve" to "restore" in one go).
preserve
    rename groupWARD_5 pathno
    // save separate file for (e.g.) youths to enable merging with annual HILDA data
    save "youths.dta", replace
restore

// *****************************************************************************
// Chronograph for the youth age segment as a whole
//
// Activity stubs, listed in activity-code order (1 = work only, 2 = work & study,
// 3 = study only, 4 = NILF, 5 = unemployed).
local acts work worstu study nilf unemp

forvalues y = 1/120 {

    // (a) Count the number of individuals in each activity in month y.
    // egen count() with an -if- is only filled in for individuals who are in that
    // activity, so the count is missing for everybody else. Taking the minimum
    // over all individuals (missing is treated by Stata as a very large positive
    // number) spreads the count to every observation.
    local k = 0
    foreach a of local acts {
        local ++k
        egen `a'cc`y' = count(activity`y') if activity`y' == `k'
        egen `a'c`y'  = min(`a'cc`y')
        drop `a'cc`y'
    }

    // (b) A count is still missing if nobody was in that activity in month y.
    // Change such missings to 0 whenever at least one activity has a non-missing
    // count for the month (so, if all counts are genuinely missing for some
    // reason, they remain missing).
    local seen workc`y' != . | worstuc`y' != . | studyc`y' != . | nilfc`y' != . | unempc`y' != .
    foreach a of local acts {
        recode `a'c`y' (. = 0) if `seen'
    }

    // (c) Percentage of (youth) in each activity in month y.
    local total (nilfc`y' + workc`y' + unempc`y' + worstuc`y' + studyc`y')
    foreach a of local acts {
        gen `a'p`y' = `a'c`y' / `total' * 100
    }

    // (d) The counts are no longer needed.
    drop workc`y' worstuc`y' studyc`y' nilfc`y' unempc`y'
}

// Select variables to be exported for the chronograph
// (execute from "preserve" to "restore" in one go).
preserve
    keep in 1 // only need the first observation as they all have the same data.
    keep xwaveid workp* worstup* studyp* nilfp* unempp*

    // convert back to long format (one row per month)
    reshape long workp worstup studyp nilfp unempp, i(xwaveid) j(order)
    drop xwaveid

    // Save in csv format so that data can be opened in excel to create the chronograph.
    // This file has its own name: it used to be written to "youthpcent_merge.out",
    // which the pathway export at the end of this program overwrites, so the
    // whole-segment data was lost.
    outsheet work* worstu* study* nilf* unemp* using "youthpcent_all.out", ///
        comma noquote nolabel replace
restore

drop workp* worstup* studyp* nilfp* unempp* // clear out chronograph variables.

// *****************************************************************************
// Chronographs for each pathway
local g = 5
local acts work worstu study nilf unemp

// Only these variables are needed from here on (same for every pathway).
keep xwaveid groupWARD_5 activity*

forvalues c = 1/`g' {

    forvalues y = 1/120 {

        // (a) Within pathway c, count the number of individuals in each activity
        // in month y and spread the count to every member of the pathway (see the
        // whole-segment section for why min() is used).
        local k = 0
        foreach a of local acts {
            local ++k
            egen `a'`c'cc`y' = count(activity`y') if activity`y' == `k' & groupWARD_5 == `c'
            egen `a'`c'c`y'  = min(`a'`c'cc`y') if groupWARD_5 == `c'
            drop `a'`c'cc`y'
        }

        // (b) Change remaining missings to 0 for members of pathway c when at least
        // one activity has a non-missing count for the month. The parentheses
        // matter: & binds more tightly than |, so without them the pathway
        // restriction would apply to the first comparison only.
        local seen work`c'c`y' != . | worstu`c'c`y' != . | study`c'c`y' != . | nilf`c'c`y' != . | unemp`c'c`y' != .
        foreach a of local acts {
            recode `a'`c'c`y' (. = 0) if groupWARD_5 == `c' & (`seen')
        }

        // (c) By pathway, percentage of individuals in each activity in month y.
        local total (nilf`c'c`y' + work`c'c`y' + unemp`c'c`y' + worstu`c'c`y' + study`c'c`y')
        foreach a of local acts {
            gen `a'`c'p`y' = `a'`c'c`y' / `total' * 100
        }

        // (d) The counts are no longer needed.
        drop work`c'c`y' worstu`c'c`y' study`c'c`y' nilf`c'c`y' unemp`c'c`y'
    }

    // Save a separate dataset for each pathway.
    // Select variables to be exported for the chronograph
    // (execute from "preserve" to "restore" in one go).
    preserve
        drop if groupWARD_5 != `c'
        keep in 1 // only need the first observation as they all have the same data.
        keep xwaveid work`c'p* worstu`c'p* study`c'p* nilf`c'p* unemp`c'p*

        // convert back to long format (one row per month)
        reshape long work`c'p worstu`c'p study`c'p nilf`c'p unemp`c'p, i(xwaveid) j(order)
        drop xwaveid

        // save chronograph information for the pathway before
        // moving on to the next pathway in the age segment
        save "youthpcentpath`c'.dta", replace
    restore

    drop work`c'p* worstu`c'p* study`c'p* nilf`c'p* unemp`c'p*
}

// Merge together information for all chronographs for the age segment (matching
// on month, "order") and save in one dataset.
use "youthpcentpath1.dta", clear
local g = 5
forvalues c = 2/`g' {
    sort order
    merge 1:1 order using "youthpcentpath`c'.dta"
    drop _merge
}

// save:
save "youthpcent_merge.dta", replace

// Save merged dataset in csv format so that data can be opened
// in excel to create the chronographs.
outsheet work* worstu* study* nilf* unemp* using "youthpcent_merge.out", ///
    comma noquote nolabel replace