// *****************************************************************************
// PROGRAM - OMCA2
// Created by Jane Fry and Clare Boulton, Productivity Commission, Melbourne
// August, 2013.
// *****************************************************************************

// THIS PROGRAM PERFORMS DESCRIPTIVE ANALYSIS OF THE SEQUENCES OF ACTIVITIES IN THE 
// PATHWAYS. THE DATA REQUIRED FOR THIS PROGRAM CAN BE GENERATED BY RUNNING THE OMCA1
// PROGRAM AND LINES 65-72 OF CODE IN THE MASTER PROGRAM. THE VARIABLES IN THE 
// DATASET AFTER RUNNING THAT CODE INCLUDE THE PATHWAY MEMBERSHIP VARIABLE (groupWARD_5
// FOR YOUTH), OM1, THE ACTIVITY VARIABLE, ORDER, AHGSEX, AHGAGE AND W1AGE. 

// FOR EACH (YOUTH) PATHWAY, SEQUENCE INDEX PLOTS ARE PRODUCED AND THE CONCENTRATION OF 
// SEQUENCES IS ANALYSED. FOR THE (YOUTH) AGE SEGMENT AS A WHOLE AND FOR EACH (YOUTH) 
// PATHWAY, EXCEL WORKSHEETS WITH DATA THAT CAN BE USED TO PRODUCE CHRONOGRAPHS ARE 
// PRODUCED. THE PROGRAM ALSO CREATES STATA DATA FILES THAT CAN BE MERGED WITH OTHER
// (ANNUAL) HILDA DATA TO PERFORM DESCRIPTIVE ANALYSIS OF THE INDIVIDUALS IN THE PATHWAYS.  

// This program creates 3 main output files:
// "for_charts_youths.dta"
// "youths.dta"					// example: youths only
// "youthpcent_merge.out"



// *****************************************************************************
//							SEQUENCE INDEX PLOTS
// *****************************************************************************

// STEP 1: Recode activity variable to minimise overplotting

	// Re-order categories in the activity variable from most to 
	// least-frequently occurring.

	// For youths. Initial categorisation for all age groups was as follows:
	// 1 "Study only" 2 "Work & study" 3 "Work only" 4 "Unemployed" 5 "NILF"
 
	//egen durs 	= sqlength(), element(1) // study only
	//egen durws 	= sqlength(), element(2) // work & study
	//egen durw 	= sqlength(), element(3) // work only
	//egen duru 	= sqlength(), element(4) // unemployed
	//egen durn 	= sqlength(), element(5) // NILF

	// summarize durs durws durw duru durn

	// change to new code - from most frequent activity for youths to least
	// 1 "Work only" 2 "Work & study" 3 "Study only" 4  "NILF"  5 "Unemployed" 

	//summarize activity // before recode

	// Swap the activity codes into the new ordering described above
	// (1 <-> 3 and 4 <-> 5). Code 2 ("Work & study") keeps its value, so it
	// needs no rule of its own.
	recode activity (1 = 3) (3 = 1) (4 = 5) (5 = 4)

	// Build the value label for the new coding one category at a time and
	// attach it to the activity variable.
	label define activity1 1 "Work only"
	label define activity1 2 "Work & study", add
	label define activity1 3 "Study only", add
	label define activity1 4 "NILF", add
	label define activity1 5 "Unemployed", add
	label values activity activity1
	//summarize activity // after recode to check

// STEP 2: keep a copy of the recoded data so that the SI plots can be produced
// later without having to do the OMCA again.
save for_charts_youths, replace


// STEP 3: Generate and save sequence index plots for each pathway

// Note: ordering of colors needs to correspond to the order of activities. 
// So, in this example ltbluishgray corresponds to activity=1 which has been coded
// above as "work only". Likewise, khaki corresponds to activity=2 (work and study);
// lavender corresponds to activity=3 (study only); cranberry corresponds to 
// activity=4 (NILF); and dknavy corresponds to activity=5 (unemployed).
// 
// Change number of pathways in the first line of code below as required.
// Change pathway no. variable and titles (2nd line) and file names (3rd line) as required.
	// One sequence index plot per pathway, each kept as a named graph in memory
	// and saved to a .gph file (Youths_1 ... Youths_5).
	forvalues p = 1/5 {
		// The same gs16 fill/outline settings are used for the graph region
		// and the plot region.
		local bg fcolor(gs16) ifcolor(gs16) lcolor(gs16) ilcolor(gs16)

		// One colour per activity code, listed in code order 1-5.
		local palette ltbluishgray khaki lavender cranberry dknavy

		// x-axis ticks every 12 months, labelled with the calendar year.
		local xticks 0 "2000" 12 "2001" 24 "2002" 36 "2003" 48 "2004" ///
			60 "2005" 72 "2006" 84 "2007" 96 "2008" 108 "2009" 120 "2010"

		sqindexplot if groupWARD_5==`p', ///
			name("Youths_`p'", replace) ///
			saving(Youths_`p', replace) ///
			order(om1) ///
			legend(on rows(1) bmargin(t=0) symxsize(3) symysize(2) ///
				region(color(gs16) fcolor(gs16) lcolor(gs16)) ///
				position(6) bexpand span) ///
			xtitle("Year", margin(vsmall) yoffset(-2.0)) ///
			ytitle("Individuals", margin(small) width(150)) ///
			color(`palette') aspectratio(0.6) ///
			graphregion(`bg') ///
			plotregion(`bg') ///
			ylabel(, labsize(small) labgap(*2)) ///
			xlabel(`xticks', labsize(small)) ///
			rbar
	}

// Each plot is saved in a Stata graph file (.gph) and can be accessed later 
// without the need for any prior commands.
	
	
// *****************************************************************************
//					   DESCRIPTIVE ANALYSIS BY PATHWAY
// *****************************************************************************

// concentration of sequences
// For further information about descriptive analysis of sequence data using 
// the SQ suite of commands see:
//		Brzinsky-Fay, C., Kohler, U. and Luniak, M. 2006, 'Sequence analysis with Stata',
//  	The Stata Journal, vol. 6, no. 4, pp. 435-60.

// Describe the sequences for everyone in the data.
sqdes

// Concentration of sequences by pathway
local g = 5  // (5 pathways)

forvalues path = 1/`g' {
	// Restrict both commands to the members of the current pathway.
	local inpath groupWARD_`g'==`path'

	sqdes if `inpath'
	sqtab if `inpath', ranks(1/20) so
}

// Percentage of time in each activity.
// Ensure element numbers correspond to correct activity (code below
// corresponds to activities for youth after the recode above).
local g = 5

// Variable-name suffix for each activity, listed in element-code order:
// 1 = w (work only), 2 = ws (work & study), 3 = s (study only),
// 4 = n (NILF), 5 = u (unemployed).
local suffixes w ws s n u

forvalues c = 1/`g' {
	local element = 0
	foreach a of local suffixes {
		local ++element

		// c<c>dur<a>: number of periods spent in this activity, filled in
		// only for members of pathway c
		egen c`c'dur`a' = sqlength() if groupWARD_`g'==`c', element(`element')

		// c<c>pc<a>: the same duration as a percentage of the 120 periods
		gen c`c'pc`a' = (c`c'dur`a'/120)*100
	}
}

// Calculate average percentage of time in each activity for each pathway

local g = 5

// For each pathway, summarise the percentage-of-time variables in the order
// work only, work & study, study only, NILF, unemployed.
forvalues c = 1/`g' {
	foreach a in w ws s n u {
		summarize c`c'pc`a'
	}
}

// For each pathway, get gender mix and average age.
local g = 5

forvalues c = 1/`g' {
	// Members of the current pathway
	local members groupWARD_`g'==`c'

	tabulate ahgsex if `members'		// gender mix
	summarize ahgage if `members'		// average age
}



// *****************************************************************************
// 								CHRONOGRAPHS

// A chronograph shows the percentage of individuals in a given group  
// (i.e. age segment or pathway) in each of the five activities (2-D areas stacked 
// along the y axis; totalling 100 per cent), over 120 months (along the x axis). 

// The code below generates the data required to produce a chronograph for an age segment
// (youth, for example) and then for each of the youth pathways.

// The data required for each group are 5 variables, with each of the 5 variables 
// representing the percentage of individuals in 1 of the 5 activities over 120 months. 

// In the first step to generating this data, numbers ('counts') of individuals in each
// activity are generated based on the activity variables for all individuals in the 
// group. Based on these counts, percentages of individuals in each activity are calculated.
// Once the 5 variables are correctly set up, all but one individual (the 1st is used 
// here) can be deleted from the dataset because all individuals should have the same 
// data. 

// But before observations can be dropped, care must be taken to ensure that 
// the observation kept will include all of the required data. 
// Care also needs to be taken to correctly distinguish between missing counts/percentages
// and zero counts/percentages. 

// *****************************************************************************

// At this point the data includes the results of OMCA (incl. groupWARD_5), ahgsex, ahgage, w1age,
// the recoded activity variable (in long format so only 1 variable), order, as well as the variables 
// derived above to describe percentage of time in each activity. 

// Switch off -more- pauses: there are lots of missing values generated by the
// code below, which produces a lot of output. The setting is applied to the
// current session only; the "permanently" option is deliberately not used so
// that running this program does not alter the user's saved Stata preferences.
set more off

// The chronograph requires data about the percentages of individuals in the
// activities rather than percentages of time spent in the activities, so the
// data must be in wide format (one activity variable per month) rather than long.
reshape wide activity, i(xwaveid) j(order)

// Drop the egen variables (percentages across time for each i) created above.
// NOTE: the wildcard removes every variable whose name starts with "c"; the
// keep statement below retains only the variables that are still needed.
drop c* 

// *****************************************************************************
// ASIDE: to save data in wide format for merging with annual HILDA data later:

	// xwaveid is needed as the merge key
	keep xwaveid groupWARD_5 activity* w1age
	
// Rename the pathway number variable consistently across age segments.
// The rename is done inside preserve/restore so that the data in memory keeps
// the original name groupWARD_5, which the code further down still uses.
// (execute from "preserve" to "restore" in one go)
	preserve
		rename groupWARD_5 pathno
		// save separate file for (e.g.) youths to enable merging with annual HILDA data 
		save "youths.dta", replace
	restore
// *****************************************************************************
// *****************************************************************************
	
// Chronograph for the youth age segment as a whole
 
	// Set up the variables, and for each activity count the number of individuals
	// in that activity in that month.
		// Variable stubs for the five activities, listed in activity-code order
		// (1 work only, 2 work & study, 3 study only, 4 NILF, 5 unemployed).
		local stubs work worstu study nilf unemp

	// Step 1: for each month and activity, count the individuals in that activity.
	// Because of the "if", the count is only filled in for the individuals who
	// are themselves in that activity in that month; it is missing for the rest.
		forvalues y = 1/120 {
			local code = 0
			foreach s of local stubs {
				local ++code
				egen `s'cc`y' = count(activity`y') if activity`y'==`code'
			}
		}

	// Step 2: spread each count to every observation. egen's min() returns the
	// smallest non-missing value, so every individual receives the count for
	// that month. It stays missing only if nobody was in the activity.
		forvalues y = 1/120 {
			foreach s of local stubs {
				egen `s'c`y' = min(`s'cc`y')
			}
		}
		// drop the first version of the "count variables"
		drop workcc* worstucc* studycc* nilfcc* unempcc*

	// Step 3: a count that is still missing means that nobody was in that
	// activity in that month. Change it to 0, but only where at least one of the
	// five counts for the same month is non-missing (so, if there are genuine
	// missing values for some reason, they will remain).
		forvalues y = 1/120 {
			local any (workc`y'!=. | worstuc`y'!=. | studyc`y'!=. | nilfc`y'!=. | unempc`y'!=.)
			recode workc`y' worstuc`y' studyc`y' nilfc`y' unempc`y' (. = 0) if `any'
		}

	// Step 4: percentage of (youth) in each activity at each month.
		forvalues y = 1/120 {
			// total number of individuals with an activity in month y
			local denom (nilfc`y' + workc`y' + unempc`y' + worstuc`y' + studyc`y')
			foreach s of local stubs {
				gen `s'p`y' = `s'c`y'/`denom'*100
			}
		}

	// The "count variables" are no longer needed.
		drop workc* worstuc* studyc* nilfc* unempc*

	// Select variables to be exported for the chronograph
	// (execute from "preserve" to "restore" in one go)
	preserve
		keep in 1 // only need the first observation as they all have the same data.
		keep xwaveid workp* worstup* studyp* nilfp* unempp*

		// convert back to long format: one row per month
		reshape long workp worstup studyp nilfp unempp, i(xwaveid) j(order)
		drop xwaveid

		// Save in csv format so that the data can be opened in excel to create
		// the chronograph. This file has its own name: the by-pathway export at
		// the end of the program writes "youthpcent_merge.out" with the replace
		// option, which previously overwrote the whole-segment data saved here.
		outsheet workp worstup studyp nilfp unempp using "youthpcent_all.out", ///
			comma noquote nolabel replace

	restore
		drop workp* worstup* studyp* nilfp* unempp* // clear out chronograph variables.

// *****************************************************************************	

// chronographs for each pathway 
local g = 5

// Only the id, the pathway membership and the monthly activity variables are
// needed from here on (this does not depend on the pathway, so it is done once).
keep xwaveid groupWARD_5 activity*

// Variable stubs for the five activities, listed in activity-code order
// (1 work only, 2 work & study, 3 study only, 4 NILF, 5 unemployed).
local stubs work worstu study nilf unemp

forvalues c = 1/`g' {

	// Step 1: for each month and activity, count the members of pathway c in
	// that activity. The count is only filled in for the pathway members who
	// are themselves in that activity in that month.
	forvalues y = 1/120 {
		local code = 0
		foreach s of local stubs {
			local ++code
			egen `s'`c'cc`y' = count(activity`y') if activity`y'==`code' & groupWARD_5==`c'
		}
	}

	// Step 2: spread each count to every member of pathway c. egen's min()
	// returns the smallest non-missing value, so each member receives the count
	// for that month. Observations outside the pathway are left missing.
	forvalues y = 1/120 {
		foreach s of local stubs {
			egen `s'`c'c`y' = min(`s'`c'cc`y') if groupWARD_5==`c'
		}
	}
	// drop the first version of the "count variables"
	drop work`c'cc* worstu`c'cc* study`c'cc* nilf`c'cc* unemp`c'cc*

	// Step 3: a count that is still missing for a pathway member means that
	// nobody in the pathway was in that activity in that month. Change it to 0,
	// but only where at least one of the five counts for the same month is
	// non-missing (so, if there are genuine missing values for some reason,
	// they will remain).
	// The "any count non-missing" test is parenthesised: in Stata & binds more
	// tightly than |, so without the parentheses the pathway restriction would
	// apply to the first comparison only.
	forvalues y = 1/120 {
		local cwork   work`c'c`y'
		local cworstu worstu`c'c`y'
		local cstudy  study`c'c`y'
		local cnilf   nilf`c'c`y'
		local cunemp  unemp`c'c`y'
		local any (`cwork'!=. | `cworstu'!=. | `cstudy'!=. | `cnilf'!=. | `cunemp'!=.)
		recode `cwork' `cworstu' `cstudy' `cnilf' `cunemp' (. = 0) if groupWARD_5==`c' & `any'
	}

	// Step 4: by pathway, percentage of individuals in each activity at each month.
	forvalues y = 1/120 {
		// total number of pathway members with an activity in month y
		local denom (nilf`c'c`y' + work`c'c`y' + unemp`c'c`y' + worstu`c'c`y' + study`c'c`y')
		foreach s of local stubs {
			gen `s'`c'p`y' = `s'`c'c`y'/`denom'*100
		}
	}

	// The "count variables" are no longer needed.
	drop work`c'c* worstu`c'c* study`c'c* nilf`c'c* unemp`c'c*

	// Save a separate dataset for each pathway.
	// Select variables to be exported for the chronograph
	// (execute from "preserve" to "restore" in one go)
	preserve
		keep if groupWARD_5==`c'
		keep in 1 // only need the first observation as they all have the same data.
		keep xwaveid work`c'p* worstu`c'p* study`c'p* nilf`c'p* unemp`c'p*

		// convert back to long format: one row per month
		reshape long work`c'p worstu`c'p study`c'p nilf`c'p unemp`c'p, i(xwaveid) j(order)
		drop xwaveid

		// save chronograph information for the pathway before moving on to
		// the next pathway in the age segment
		save "youthpcentpath`c'.dta", replace
	restore
		// clear out this pathway's chronograph variables
		drop work`c'p* worstu`c'p* study`c'p* nilf`c'p* unemp`c'p*
}
	
	// merge together information for all chronographs for the age segment and save in one dataset 
	// Start from pathway 1 and add the columns for pathways 2 to g, matching
	// on month (order). nogenerate stops merge from creating _merge.
	use "youthpcentpath1.dta", clear
	local g = 5
	forvalues c = 2/`g' {
		sort order
		merge 1:1 order using "youthpcentpath`c'.dta", nogenerate
	}

// Keep a Stata-format copy of the combined pathway data.
save "youthpcent_merge.dta", replace


// Also write the merged dataset in csv format so that the data can be opened
// in excel to create the chronographs.
outsheet work* worstu* study* nilf* unemp* ///
	using "youthpcent_merge.out", comma noquote nolabel replace