# Productivity Commission 2018 analysis for the Rising Inequality? Commission Research Paper # Longitudinal HILDA analysis ################################################################################# # ATTACH TO ALL OBSERVATIONS - LONGITUDINAL WEIGHTS 2001 THROUGH 2016 # create file with 2001 through 2016 weights for each enumerated and responding person dat2 <- dat %>% filter(year==2016) %>% select (xwaveid, lnwte, lnwtrp) %>% rename(long_wt=lnwte, long_wt_rp = lnwtrp) # join to main file and remove old object dat <- left_join(dat, dat2, by="xwaveid") rm(dat2) ########################################## # INCOME DECILES AND POVERTY ########################################## # weighted cut points cuts_long <- dat %>% group_by(year) %>% do(data.frame(t(wtd.quantile(.$eq_disp_inc, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>% ungroup() %>% rename(cut0=X..0., cut10=X.10., cut20=X.20., cut30=X.30., cut40=X.40., cut50=X.50., cut60=X.60., cut70=X.70., cut80=X.80., cut90=X.90., cut100=X100.) # add in 50% of income variable cuts_long <- mutate(cuts_long, half_median = 0.5*cut50) # join weighted cut points to the dataframe dat <- left_join(dat, cuts_long, by="year") # assign each observation to a decile basked depending on relevant weighted cut points dat <- dat %>% mutate(inc_dec = case_when( eq_disp_inc <= cut10 ~ 1, eq_disp_inc > cut10 & eq_disp_inc <= cut20 ~ 2, eq_disp_inc > cut20 & eq_disp_inc <= cut30 ~ 3, eq_disp_inc > cut30 & eq_disp_inc <= cut40 ~ 4, eq_disp_inc > cut40 & eq_disp_inc <= cut50 ~ 5, eq_disp_inc > cut50 & eq_disp_inc <= cut60 ~ 6, eq_disp_inc > cut60 & eq_disp_inc <= cut70 ~ 7, eq_disp_inc > cut70 & eq_disp_inc <= cut80 ~ 8, eq_disp_inc > cut80 & eq_disp_inc <= cut90 ~ 9, eq_disp_inc > cut90 ~ 10, TRUE ~ -99 ) ) # assign each observation a 1 if equivalised household disposable income less than half of median, 0 otherwise dat <- dat %>% mutate(inc_poverty = case_when( eq_disp_inc < half_median ~ 1, eq_disp_inc >= half_median ~ 0, TRUE ~ -99) ) ################################################################# # WEALTH DECILES ############################################################## # weighted cut points cuts_long_wealth <- dat %>% group_by(year) %>% filter(year %in% wealth_years) %>% do(data.frame(t(wtd.quantile(.$eq_wealth, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1)) ) ) ) %>% ungroup() %>% rename( wealth_cut0=X..0., wealth_cut10=X.10., wealth_cut20=X.20., wealth_cut30=X.30., wealth_cut40=X.40., wealth_cut50=X.50., wealth_cut60=X.60., wealth_cut70=X.70., wealth_cut80=X.80., wealth_cut90=X.90., wealth_cut100=X100.) # join weighted cut points to the dataframe dat <- left_join(dat, cuts_long_wealth, by="year") # assign each observation to a decile basked depending on relevant weighted cut points dat <- dat %>% mutate(wealth_dec = case_when( eq_wealth <= wealth_cut10 ~ 1, eq_wealth > wealth_cut10 & eq_wealth <= wealth_cut20 ~ 2, eq_wealth > wealth_cut20 & eq_wealth <= wealth_cut30 ~ 3, eq_wealth > wealth_cut30 & eq_wealth <= wealth_cut40 ~ 4, eq_wealth > wealth_cut40 & eq_wealth <= wealth_cut50 ~ 5, eq_wealth > wealth_cut50 & eq_wealth <= wealth_cut60 ~ 6, eq_wealth > wealth_cut60 & eq_wealth <= wealth_cut70 ~ 7, eq_wealth > wealth_cut70 & eq_wealth <= wealth_cut80 ~ 8, eq_wealth > wealth_cut80 & eq_wealth <= wealth_cut90 ~ 9, eq_wealth > wealth_cut90 ~ 10, TRUE ~ -99 ) ) ############################################################################# # CONSUMPTION ########################################################################### # CREATE CONSUMPTION DATAFRAME # drop those not appropriate for consumption - generate new consumption data frame # those before 2006 (and those with missing rent (<0) ) dat_cons <- dat %>% filter(year >= 2006) %>% subset(eq_rent >= 0) # CONSUMPTION DECILES # weighted cut points cuts_long_cons <- dat_cons %>% group_by(year) %>% do(data.frame(t(wtd.quantile(.$eq_cons, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1)) ) ) ) %>% ungroup() %>% rename(cons_cut0=X..0., cons_cut10=X.10., cons_cut20=X.20., cons_cut30=X.30., cons_cut40=X.40., cons_cut50=X.50., cons_cut60=X.60., cons_cut70=X.70., cons_cut80=X.80., cons_cut90=X.90., cons_cut100=X100.) # join weighted cut points to the dataframe dat_cons <- left_join(dat_cons, cuts_long_cons, by="year") # assign each observation to a decile basked depending on relevant weighted cut points dat_cons <- dat_cons %>% mutate(cons_dec = case_when( eq_cons <= cons_cut10 ~ 1, eq_cons > cons_cut10 & eq_cons <= cons_cut20 ~ 2, eq_cons > cons_cut20 & eq_cons <= cons_cut30 ~ 3, eq_cons > cons_cut30 & eq_cons <= cons_cut40 ~ 4, eq_cons > cons_cut40 & eq_cons <= cons_cut50 ~ 5, eq_cons > cons_cut50 & eq_cons <= cons_cut60 ~ 6, eq_cons > cons_cut60 & eq_cons <= cons_cut70 ~ 7, eq_cons > cons_cut70 & eq_cons <= cons_cut80 ~ 8, eq_cons > cons_cut80 & eq_cons <= cons_cut90 ~ 9, eq_cons > cons_cut90 ~ 10, TRUE ~ -99 ) ) # join consumption deciles to main frame cons_join <- dat_cons %>% select(xwaveid, year, cons_dec) dat <- left_join(dat, cons_join, by=c("xwaveid","year")) rm(cons_join) ######################################################################################################## # BEGINNING OF OPTIONAL ANALYSIS - CAN BE DONE BIT BY BIT ###################################################################################################### ######################################################### # MOBILITY ########################################################### ############# # MOVEMENT BETWEEN INCOME DECILES - MOBILITY GRID - TABLE 5.1 (INCOME) inc_path0116 <- as.data.frame(10*prop(wtd.table(dat$inc_dec[dat$year == 2001], dat$inc_dec[dat$year == 2016], dat$long_wt[dat$year == 2016]))) colnames(inc_path0116) <- c("inc_dec01", "inc_dec16", "per_cent") inc_path0116 <- inc_path0116 %>% filter(inc_dec01!="Total", inc_dec16!="Total") # for table - wide to long inc_path0116 %>% spread(inc_dec16, per_cent) ############# # MOVEMENT BETWEEN WEALTH DECILES - MOBILITY GRID - TABLE 5.2 (WEALTH) wealth_path0214_B <- as.data.frame(10*prop(wtd.table(dat$wealth_dec[dat$year == 2002], dat$wealth_dec[dat$year == 2014], dat$long_wt[dat$year == 2016]))) colnames(wealth_path0214_B) <- c("wealth_dec02", "wealth_dec14", "per_cent") wealth_path0214_B <- wealth_path0214_B %>% filter(wealth_dec02!="Total", wealth_dec14!="Total") # for table - wide to long wealth_path0214_B %>% spread(wealth_dec14, per_cent) # percentage of people that are more than $6000 above the top income bracket (>$96000) # 'but most of those in the top decile would remain in the top decile if they incurred the same drop in income' (in text, chap 5) dat %>% select(hhwt, eq_disp_inc) %>% mutate(stuck_high = ifelse(eq_disp_inc > 96000,1,0) ) %>% summarise(perc = sum(hhwt*stuck_high)/sum(hhwt) ) #################################### # Sankey/Alluvial diagrams - Figure 5.5 (main Sankey diagram) # get data in right format - all deciles alluvial dat_al <- dat %>% select(xwaveid, year, inc_dec, long_wt) %>% filter(year==2001 | year==2016) %>% spread(key=year, value=inc_dec) %>% rename(d2001='2001', d2016='2016') %>% group_by(d2001, d2016) %>% summarise(pop = sum(long_wt) ) # emf(file = here("HILDA", "charts", "alluvial_dec_hide.emf"), width = 5.90551, height = 7.7) # alluvial with all deciles - only three colours showing, rest white (for report - tall version) alluvial(dat_al[,1:2], freq=dat_al$pop, hide = dat_al$d2001 %in% c(2,3,4,6,7,8,9), col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow, dat_al$d2001==5 ~ alluvial_blue, dat_al$d2001==10 ~ alluvial_green, TRUE ~ grey) , border=NA, alpha=0.7, # gap.width=0, axis_labels = c("2000-01","2015-16"), cw=0.05, xw=0.4) # dev.off() # same chart - for presentation on slide (couldn't get it for dark background though) - wide version. different font. # emf(file = here("HILDA", "charts", "alluvial_dec_hide_presentation.emf"), width = 9.5, height = 5.5) # svg(file = here("HILDA", "charts", "alluvial_dec_hide_presentation.svg")) # alluvial with all deciles - only three colours showing, rest white alluvial(dat_al[,1:2], freq=dat_al$pop, hide = dat_al$d2001 %in% c(2,3,4,6,7,8,9), col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow, dat_al$d2001==5 ~ alluvial_blue, dat_al$d2001==10 ~ alluvial_green, TRUE ~ grey) , border=NA, # border on ribbons alpha=0.7, # gap.width=0, axis_labels = c("2000-01","2015-16"), cex = 1.5, # font size cex.axis = 1.5, # font size labels cw=0.08, xw=0.4) # dev.off() # just top decile # emf(file = here("HILDA", "charts", "alluvial_dec_hide_presentation_top_only.emf"), width = 9.5, height = 5.5) alluvial(dat_al[,1:2], freq=dat_al$pop, hide = dat_al$d2001 %in% c(1, 2,3,4,5,6,7,8,9), col = case_when( #dat_al$d2001 == 1 ~ alluvial_yellow, # dat_al$d2001==5 ~ alluvial_blue, dat_al$d2001==10 ~ alluvial_green, TRUE ~ grey) , border=NA, # border on ribbons alpha=0.7, # gap.width=0, axis_labels = c("2000-01","2015-16"), cex = 1.5, # font size cex.axis = 1.5, # font size labels cw=0.08, xw=0.4) # dev.off() # just top and bottom # emf(file = here("HILDA", "charts", "alluvial_dec_hide_presentation_top_and_bottom.emf"), width = 9.5, height = 5.5) alluvial(dat_al[,1:2], freq=dat_al$pop, hide = dat_al$d2001 %in% c(2,3,4,5, 6,7,8,9), col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow, # dat_al$d2001==5 ~ alluvial_blue, dat_al$d2001==10 ~ alluvial_green, TRUE ~ grey) , border=NA, # border on ribbons alpha=0.7, # gap.width=0, axis_labels = c("2000-01","2015-16"), cex = 1.5, # font size cex.axis = 1.5, # font size labels cw=0.08, xw=0.4) # dev.off() # all ten deciles - taste the rainbow! :) # emf(file = here("HILDA", "charts", "alluvial_dec_presentation_10.emf"), width = 9.5, height = 5.5) alluvial(dat_al[,1:2], freq=dat_al$pop, # hide = dat_al$d2001 %in% c(2,3,4,6,7,8,9), col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow, dat_al$d2001==5 ~ alluvial_blue, dat_al$d2001==10 ~ alluvial_green, dat_al$d2001==2 ~ orange, dat_al$d2001==3 ~ red, dat_al$d2001==4 ~ purple, dat_al$d2001==6 ~ mid_blue, dat_al$d2001==7 ~ dark_blue, dat_al$d2001==8 ~ dark_green, dat_al$d2001==9 ~ PC_green, TRUE ~ grey) , border=NA, # border on ribbons alpha=0.7, # gap.width=0, axis_labels = c("2000-01","2015-16"), cex = 1.5, # font size cex.axis = 1.5, # font size labels cw=0.08, xw=0.4) # dev.off() ####### # Sankey/Alluvial - Absolute income values - 3 categories, fixed levels (Absolute mobility v relative mobility) - Figure 5.10 # get data sorted dat_quin_abs <- dat %>% select(xwaveid, year, eq_disp_inc, long_wt) %>% mutate(inc_quin = case_when( eq_disp_inc <30000 ~ "$30k and less", eq_disp_inc >=30000 & eq_disp_inc <50000 ~ "$30k to $50k", eq_disp_inc >=50000 ~ "$50k and up", TRUE ~ "Problem" ) ) %>% select(-eq_disp_inc) %>% filter(year==2001 | year==2016) %>% spread(key=year, value=inc_quin) %>% rename(d2001='2001', d2016='2016') %>% group_by(d2001, d2016) %>% summarise(pop = sum(long_wt) ) # emf(file = here("HILDA", "charts", "alluvial_abs.emf"), width = 5.90551, height = 3.14961) # chart alluvial(dat_quin_abs[,1:2], freq=dat_quin_abs$pop, col = case_when(dat_quin_abs$d2001 == "$30k and less" ~ "#A52828", dat_quin_abs$d2001=="$30k to $50k" ~ "#66BCDB", dat_quin_abs$d2001=="$50k and up" ~ "#265A9A", TRUE ~ "grey") , border=NA, alpha=0.7, axis_labels = c("2000-01","2015-16"), gap.width=0.0, # relative width of inter-category gaps xw=0.3, # the distance from the set axis to the control points of the xspline cw=0.12, # width of the category axis cex=1) # font size of category labels # dev.off() # using this dataset - how many people END UP in each category? - under $30,000 = 22.7%, $30,000 to $50,000 = 31.6%, more than $50,000 = 45.6% dat_quin_abs %>% group_by(d2016) %>% mutate(total_pop = sum(pop) ) %>% ungroup() %>% mutate(perc = 3*total_pop/sum(total_pop)) %>% arrange(d2016) %>% View() # using this dataset - how many people START in each category? - under $30,000 = 33.6%, $30,000 to $50,000 = 38.7%, more than $50,000 = 27.6% dat_quin_abs %>% group_by(d2001) %>% mutate(total_pop = sum(pop) ) %>% ungroup() %>% mutate(perc = 3*total_pop/sum(total_pop)) %>% arrange(d2001) %>% View() ############################ # US alluvial for income mobility - From 'Fisher et al 2016 Inequaltiy and Mobility using income, consumption, wealth for the same individuals' (p. 54) # NOTES: # restricted to adults who are between 25 and 50 in 1999 (p. 53) # income is reported for the previous taxable year, measures are in 2013 constant dollars, equivalence scale given by square root of family size, (p. 49) # we use family level file - but other measures are at individual level (p. 49). Also say 'individual mobility' (p. 53) and talk about 53 percent of people' (p. 53, # so on balance, I think they are using individual level for mobility. US_al <- read.csv(file=here("US intragenerational income mobility, Fisher et al 2016.csv")) # emf(file = here("HILDA", "charts", "alluvial_US.emf"), width = 2.95, height = 3.14961) # svg(file = here("HILDA", "charts", "alluvial_US.svg")) alluvial(US_al[,1:2], freq=US_al$pop, hide = US_al$q1999 %in% c(2,4), col = case_when( US_al$q1999 == 1 ~ alluvial_yellow, US_al$q1999==3 ~ alluvial_blue, US_al$q1999==5 ~ alluvial_green, TRUE ~ grey) , border=NA, # border on ribbons alpha=0.7, # gap.width=0, axis_labels = c("1998-99","2012-13"), cex = 1, # font size cex.axis = 1, # font size labels cw=0.08, xw=0.4) # dev.off() ################## # Australian version to match up with the US (2000-01 to 2014-15, closest matching time period) # dat <- mutate(dat, sqrt_eq_disp_inc = hh_disp_inc/square_root_eq) # INCOME DECILES - SQUARE ROOT EQUIVALENCE SCALE # weighted cut points cuts_long_sqrt <- dat %>% filter( (year==2001 & (age>25 & age<50) ) | (year==2015 & (age>39 & age<64) ) | year %in% c(2002:2014) | year==2016 ) %>% # for 2001 and 2015, only use people within age groups group_by(year) %>% do(data.frame(t(wtd.quantile(.$sqrt_eq_disp_inc, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>% ungroup() %>% rename(cut_sqrt0=X..0., cut_sqrt10=X.10., cut_sqrt20=X.20., cut_sqrt30=X.30., cut_sqrt40=X.40., cut_sqrt50=X.50., cut_sqrt60=X.60., cut_sqrt70=X.70., cut_sqrt80=X.80., cut_sqrt90=X.90., cut_sqrt100=X100.) # join weighted cut points to the dataframe dat <- left_join(dat, cuts_long_sqrt, by="year") # assign each observation to a decile basked depending on relevant weighted cut points dat <- dat %>% mutate(sqrt_inc_dec = case_when( sqrt_eq_disp_inc <= cut_sqrt10 ~ 1, sqrt_eq_disp_inc > cut_sqrt10 & sqrt_eq_disp_inc <= cut_sqrt20 ~ 2, sqrt_eq_disp_inc > cut_sqrt20 & sqrt_eq_disp_inc <= cut_sqrt30 ~ 3, sqrt_eq_disp_inc > cut_sqrt30 & sqrt_eq_disp_inc <= cut_sqrt40 ~ 4, sqrt_eq_disp_inc > cut_sqrt40 & sqrt_eq_disp_inc <= cut_sqrt50 ~ 5, sqrt_eq_disp_inc > cut_sqrt50 & sqrt_eq_disp_inc <= cut_sqrt60 ~ 6, sqrt_eq_disp_inc > cut_sqrt60 & sqrt_eq_disp_inc <= cut_sqrt70 ~ 7, sqrt_eq_disp_inc > cut_sqrt70 & sqrt_eq_disp_inc <= cut_sqrt80 ~ 8, sqrt_eq_disp_inc > cut_sqrt80 & sqrt_eq_disp_inc <= cut_sqrt90 ~ 9, sqrt_eq_disp_inc > cut_sqrt90 ~ 10, TRUE ~ -99 ) ) # turn deciles into quintiles dat <- dat %>% mutate(sqrt_inc_quin = case_when( sqrt_inc_dec %in% c(1,2) ~ 1, sqrt_inc_dec %in% c(3,4) ~ 2, sqrt_inc_dec %in% c(5,6) ~ 3, sqrt_inc_dec %in% c(7,8) ~ 4, sqrt_inc_dec %in% c(9,10) ~ 5, TRUE ~ -99 ) ) # get data sorted for alluvial chart # dat_al_sqrt_quin <- dat %>% # select(xwaveid, year, sqrt_inc_quin, long_wt, age) %>% # filter((year==2001 & (age>25 & age<50) ) | (year==2015 & (age>39 & age<64) ) ) %>% View() # using these years as closest match to US data. # spread(key=year, value=sqrt_inc_quin) %>% # rename(q2001='2001', q2015='2015') %>% quin2001 <- dat %>% select(xwaveid, year, sqrt_inc_quin, long_wt, age) %>% filter((year==2001 & (age>25 & age<50) ) ) %>% rename(q2001 = sqrt_inc_quin) %>% select(xwaveid, q2001) quin2015 <- dat %>% select(xwaveid, year, sqrt_inc_quin, long_wt, age) %>% filter((year==2015 & (age>39 & age<64)) ) %>% rename(q2015 = sqrt_inc_quin) %>% select( xwaveid, q2015, long_wt ) dat_al_sqrt_quin <- left_join(quin2001, quin2015, by="xwaveid") # calculations - population and percentage dat_al_sqrt_quin <- dat_al_sqrt_quin %>% group_by(q2001, q2015) %>% summarise(pop = sum(long_wt) ) %>% ungroup() %>% group_by(q2001) %>% mutate(perc = 100*pop/sum(pop)) # emf(file = here("HILDA", "charts", "alluvial_Aus_match_US.emf"), width = 2.95, height = 3.14961) # svg(file = here("HILDA", "charts", "alluvial_Aus_match_US.svg")) # alluvial chart - Australi to match the US alluvial(dat_al_sqrt_quin[,1:2], freq=dat_al_sqrt_quin$pop, hide = dat_al_sqrt_quin$q2001 %in% c(2,4), col = case_when( dat_al_sqrt_quin$q2001 == 1 ~ alluvial_yellow, dat_al_sqrt_quin$q2001==3 ~ alluvial_blue, dat_al_sqrt_quin$q2001==5 ~ alluvial_green, TRUE ~ grey) , border=NA, # border on ribbons alpha=0.7, # gap.width=0, axis_labels = c("2000-01","2014-15"), cex = 1, # font size cex.axis = 1, # font size labels cw=0.08, xw=0.4) # dev.off() ############################################# # MOBILITY STATS ##### # how many people spend at least one period in the top income decile pop_top <- dat %>% select(xwaveid, long_wt, year, inc_dec) %>% mutate(top = ifelse(inc_dec==10, 1, 0) ) %>% select(-inc_dec) %>% arrange(xwaveid, year) %>% mutate(tot=cumsum(top)) %>% filter(year==2016) %>% mutate(num_top = tot-lag(tot)) %>% mutate(winner = ifelse(num_top>0, 1, 0)) %>% filter(winner==1) %>% summarise(sum(long_wt)) tot_pop <- dat %>% filter(year==2016) %>% summarise(sum(long_wt)) pop_top/tot_pop ##### # how many people spend at least one period in the bottom decile pop_bottom <- dat %>% select(xwaveid, long_wt, year, inc_dec) %>% mutate(bottom = ifelse(inc_dec==1, 1, 0) ) %>% select(-inc_dec) %>% arrange(xwaveid, year) %>% mutate(tot=cumsum(bottom)) %>% filter(year==2016) %>% mutate(num_bottom = tot-lag(tot)) %>% mutate(one_bottom = ifelse(num_bottom>0, 1, 0)) %>% filter(one_bottom==1) %>% summarise(sum(long_wt)) tot_pop <- dat %>% filter(year==2016) %>% summarise(sum(long_wt)) pop_bottom/tot_pop ##### # how many people spent at least one period in the top and one period in the bottom deciles - (in chapter 3) pop_top_bottom <- dat %>% select(xwaveid, long_wt, year, inc_dec) %>% mutate(bottom = ifelse(inc_dec==1, 1, 0), top = ifelse(inc_dec==10, 1, 0) ) %>% select(-inc_dec) %>% arrange(xwaveid, year) %>% mutate(tot_bottom=cumsum(bottom), tot_top = cumsum(top) ) %>% filter(year==2016) %>% mutate(num_bottom = tot_bottom - lag(tot_bottom), num_top = tot_top - lag(tot_top) ) %>% mutate(top_bottom = ifelse(num_bottom>0 & num_top>0, 1, 0)) %>% filter(top_bottom==1) %>% summarise(sum(long_wt)) tot_pop <- dat %>% filter(year==2016) %>% summarise(sum(long_wt)) pop_top_bottom/tot_pop ############### # how many income deciles does each person inhabit over the whole period? - INCOME (in key points) # on average dat %>% select(xwaveid, long_wt, year, inc_dec) %>% group_by(xwaveid, long_wt) %>% summarise(n_inc_dec = n_distinct(inc_dec)) %>% ungroup() %>% summarise(avg_inc_dec = wtd.mean(n_inc_dec, long_wt) ) # how many people stay in the same decile, by starting income decile dat %>% arrange(xwaveid, year) %>% select(xwaveid, long_wt, year, inc_dec) %>% group_by(xwaveid, long_wt) %>% summarise(n_inc_dec = n_distinct(inc_dec), inc_dec01 = first(inc_dec)) %>% ungroup() %>% group_by(inc_dec01, n_inc_dec) %>% summarise(group_total = sum(long_wt)) %>% ungroup() %>% group_by(inc_dec01) %>% mutate(perc = 100*group_total/sum(group_total)) %>% View() # how many people were in both top income and top wealth decile in all years entrenched_advantage <- dat %>% select(xwaveid, long_wt, year, inc_dec, wealth_dec) %>% group_by(xwaveid, long_wt) %>% filter(n_distinct(inc_dec)==1, inc_dec==10) %>% ungroup() %>% filter(year %in% wealth_years) %>% group_by(xwaveid, long_wt) %>% filter(n_distinct(wealth_dec)==1, wealth_dec==10) %>% ungroup() %>% filter(year==2014) %>% summarise(group_total = sum(long_wt) ) tot_pop <- dat %>% filter(year==2014) %>% summarise(sum(long_wt)) entrenched_advantage/tot_pop # how many people spent at least one year in a decile (by decile) dat %>% select(xwaveid, long_wt, inc_dec, year) %>% group_by(year) %>% mutate(tot = sum(long_wt)) %>% ungroup() %>% group_by(inc_dec) %>% distinct(xwaveid, .keep_all=TRUE) %>% summarise(group_total = sum(long_wt)/ mean(tot) ) %>% View() ############ # how many WEALTH deciles does each person inhabit over the whole period? - WEALTH # how many people stay in the same decile, by starting wealth deciles dat %>% filter(year %in% wealth_years) %>% arrange(xwaveid, year) %>% select(xwaveid, long_wt, year, wealth_dec) %>% group_by(xwaveid, long_wt) %>% summarise(n_wealth_dec = n_distinct(wealth_dec), wealth_dec02 = first(wealth_dec)) %>% ungroup() %>% group_by(wealth_dec02, n_wealth_dec) %>% summarise(group_total = sum(long_wt)) %>% ungroup() %>% group_by(wealth_dec02) %>% mutate(perc = 100*group_total/sum(group_total)) %>% View() ####################################################### # what is the SPAN of income decile inhabited by each person (use View to see numbers) - Figure 5.4 # emf(file = here("HILDA", "charts", "span_income.emf"), width = 5.90551, height = 3.14961) dat %>% select(xwaveid, long_wt, year, inc_dec) %>% group_by(xwaveid, long_wt) %>% summarise(span=max(inc_dec)-min(inc_dec)) %>% ungroup() %>% group_by(span) %>% summarise(group_total = sum(long_wt) ) %>% mutate(perc = 100*group_total/sum(group_total)) %>% ggplot(aes(x=span, y=perc)) + geom_col(fill=dark_blue) + scale_y_continuous(name="Per cent", limits = c(0, 20), expand = c(0.0001, 0.0001)) + scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) + theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm") ) # dev.off() # for presentation on dark background # emf(file = here("HILDA", "charts", "span_income_presentation.emf"), width = 5.90551, height = 3.14961) dat %>% select(xwaveid, long_wt, year, inc_dec) %>% group_by(xwaveid, long_wt) %>% summarise(span=max(inc_dec)-min(inc_dec)) %>% ungroup() %>% group_by(span) %>% summarise(group_total = sum(long_wt) ) %>% mutate(perc = 100*group_total/sum(group_total)) %>% ggplot(aes(x=span, y=perc)) + geom_col(fill=blue) + scale_y_continuous(name="Per cent", limits = c(0, 20), expand = c(0.0001, 0.0001)) + scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) + theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), axis.text.x = element_text(colour=white), axis.text.y = element_text(colour=white), axis.title.x = element_text(colour=white), axis.title.y = element_text(colour=white)) # dev.off() # what is the SPAN - WEALTH - histogram - (use View to see numbers) - Figure 5.6 # emf(file = here("HILDA", "charts", "span_wealth_presentation.emf"), width = 5.90551, height = 3.14961) dat %>% select(xwaveid, long_wt, year, wealth_dec) %>% filter(year %in% wealth_years) %>% group_by(xwaveid, long_wt) %>% summarise(span=max(wealth_dec)-min(wealth_dec)) %>% ungroup() %>% group_by(span) %>% summarise(group_total = sum(long_wt) ) %>% mutate(perc = 100*group_total/sum(group_total)) %>% ggplot(aes(x=span, y=perc)) + geom_col(fill=blue) + scale_y_continuous(name="Per cent", limits = c(0, 30), expand = c(0.0001, 0.0001)) + scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) + # geom_hline(yintercept = 0, colour = grey) + theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), axis.text.x = element_text(colour=white), axis.text.y = element_text(colour=white), axis.title.x = element_text(colour=white), axis.title.y = element_text(colour=white) ) # dev.off() # same chart - for presentation with dark background # what is the SPAN - WEALTH - histogram - (use View to see numbers) - Figure 5.6 # emf(file = here("HILDA", "charts", "span_wealth.emf"), width = 5.90551, height = 3.14961) dat %>% select(xwaveid, long_wt, year, wealth_dec) %>% filter(year %in% wealth_years) %>% group_by(xwaveid, long_wt) %>% summarise(span=max(wealth_dec)-min(wealth_dec)) %>% ungroup() %>% group_by(span) %>% summarise(group_total = sum(long_wt) ) %>% mutate(perc = 100*group_total/sum(group_total)) %>% ggplot(aes(x=span, y=perc)) + geom_col(fill=dark_blue) + scale_y_continuous(name="Per cent", limits = c(0, 30), expand = c(0.0001, 0.0001)) + scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) + # geom_hline(yintercept = 0, colour = grey) + theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm") ) # dev.off() # span of income but only looking at wealth years, for comparison with wealth (in text, chap 5) # "40 per cent had a difference of three or more deciles between the top and bottom wealth deciles they spent time in, # while 60 per cent of people had a difference of three or more deciles between the top and bottom income deciles they spent time in" dat %>% select(xwaveid, long_wt, year, inc_dec) %>% filter(year %in% wealth_years) %>% group_by(xwaveid, long_wt) %>% summarise(span=max(inc_dec)-min(inc_dec)) %>% ungroup() %>% group_by(span) %>% summarise(group_total = sum(long_wt) ) %>% mutate(perc = 100*group_total/sum(group_total)) %>% arrange(desc(span)) %>% mutate(culm_perc = cumsum(perc)) %>% View() # how many people stay in the same decile, by starting income decile (ONLY WEALTH YEARS) (for in text comparison) # "People were also twice as likely to remain in the same wealth decile over this period than they were to remain in the same income decile" # income deciles dat %>% arrange(xwaveid, year) %>% filter(year %in% wealth_years) %>% select(xwaveid, long_wt, year, inc_dec) %>% group_by(xwaveid, long_wt) %>% summarise(n_inc_dec = n_distinct(inc_dec), inc_dec01 = first(inc_dec)) %>% ungroup() %>% group_by(inc_dec01, n_inc_dec) %>% summarise(group_total = sum(long_wt)) %>% ungroup() %>% group_by(inc_dec01) %>% mutate(perc = 100*group_total/sum(group_total)) %>% ungroup() %>% arrange(n_inc_dec) %>% mutate(culm_perc = cumsum(perc)) %>% View() # how many people stay in the same decile, by starting wealth deciles dat %>% filter(year %in% wealth_years) %>% arrange(xwaveid, year) %>% select(xwaveid, long_wt, year, wealth_dec) %>% group_by(xwaveid, long_wt) %>% summarise(n_wealth_dec = n_distinct(wealth_dec), wealth_dec02 = first(wealth_dec)) %>% ungroup() %>% group_by(wealth_dec02, n_wealth_dec) %>% summarise(group_total = sum(long_wt)) %>% ungroup() %>% group_by(wealth_dec02) %>% mutate(perc = 100*group_total/sum(group_total)) %>% ungroup() %>% arrange(n_wealth_dec) %>% mutate(culm_perc = cumsum(perc)) %>% View() # income deciles - all years - how many stuck in the bottom two deciles - 2.77% (exec summary, in text) bottom_2_inc <- dat %>% arrange(xwaveid, year) %>% select(xwaveid, long_wt, year, inc_dec) %>% group_by(xwaveid, long_wt) %>% summarise(n_inc_dec = n_distinct(inc_dec), max_dec = max(inc_dec) ) %>% ungroup() %>% filter(n_inc_dec <= 2) %>% filter(max_dec <= 2) %>% summarise(group_total = sum(long_wt)) tot_pop <- dat %>% filter(year==2016) %>% summarise(sum(long_wt)) bottom_2_inc/tot_pop ################################################## # MOBILITY - RETIREMENT AND COUPLING/DECOUPLING (Table 5.3) ######################################## # Retirement - association with decile changes (Table 5.3) # change factor to integer dat <- mutate(dat, just_retired = as.integer(lertr) - 11) # 0 = No, not just retired # 1 = yes, just retired # comparing those that retired in the last year, and those that didn't (over 18) - total amounts filling - numbers (percents) - responding person - big or small shift dat %>% select(xwaveid, year, long_wt_rp, age, inc_dec, just_retired, rp) %>% arrange(xwaveid, year) %>% mutate(inc_shift = lead(inc_dec, 2) - lag(inc_dec, 1)) %>% # numbers determine how many periods to lead or lag - retirement may effect income in this period and one period into the future (as income recorded for previous financial year) filter(year %in% seq(2002, 2014) ) %>% # - need to adjust year to take account of lag/lead (so don't grab inc_dec from different people) filter(just_retired == 1 | just_retired == 0) %>% # select on those that answer either yes or no to 'just retired' filter(age>=18) %>% filter(rp==1) %>% mutate(shift_size = case_when( inc_shift >= 4 ~ "big up", inc_shift < 4 & inc_shift > 0 ~ "small up", inc_shift == 0 ~ "no shift", inc_shift <= -4 ~ "big down", inc_shift > -4 & inc_shift < 0 ~ "small down", TRUE ~ "wrong") ) %>% group_by(shift_size, just_retired) %>% summarise(group_total = sum(long_wt_rp)) %>% ungroup() %>% group_by(shift_size) %>% mutate(perc = group_total/sum(group_total)) %>% View() ########################################### # Coupling/Decoupling - association with decile changes (Table 5.3 and preceding para) # mrcurr variable seems to be the best quality variable if i go to the effort of calculating my own (it is derived from other answers, and includes married and defacto for all years) # https://www.online.fbe.unimelb.edu.au/HILDAodd/KWCrossWaveCategoryDetails.aspx?varnt=mrcurr dat <- dat %>% mutate(mar_status = as.integer(mrcurr) - 10) # small number of refused and don't know # 1 = legally married # 2 = de facto # 3 = separated # 4 = divorced # 5 = widowed # 6 = never married and not defacto # change in marriage status with previous year dat <- dat %>% arrange(xwaveid, year) %>% mutate(mar_change = case_when( mar_status>=3 & lag(mar_status)<=2 ~ "decoupled", # de-coupled in past year mar_status<=2 & lag(mar_status)>=3 ~ "coupled", TRUE ~ "no change") ) # calculate change in mar status between wealth years dat_wealth_shift <- dat %>% arrange(xwaveid, year) %>% filter(year %in% wealth_years) %>% mutate(mar_change_wealth = case_when( mar_status>=3 & lag(mar_status)<=2 ~ "decoupled", # de-coupled in past year mar_status<=2 & lag(mar_status)>=3 ~ "coupled", TRUE ~ "no change") ) # percentage of people experiencing given INCOME shift who coupled/decouples/no change (Table 5.3) dat %>% filter(rp==1) %>% select(xwaveid, year, inc_dec, long_wt_rp, mar_change) %>% arrange(xwaveid, year) %>% mutate(inc_shift = lead(inc_dec, 2) - lag(inc_dec, 1)) %>% # numbers determine how many periods to lead or lag - retirement may effect income in this period and one period into the future filter(year %in% seq(2002, 2014) ) %>% # - need to adjust year to take account of lag/lead (so don't grab inc_dec from different people) mutate(shift_size = case_when( inc_shift >= 4 ~ "big up", inc_shift < 4 & inc_shift > 0 ~ "small up", inc_shift == 0 ~ "no shift", inc_shift <= -4 ~ "big down", inc_shift > -4 & inc_shift < 0 ~ "small down", TRUE ~ "wrong") ) %>% group_by(shift_size, mar_change) %>% summarise(group_total = sum(long_wt_rp)) %>% ungroup() %>% group_by(shift_size) %>% mutate(perc = group_total/sum(group_total)) %>% View() # percentage of people experiencing given WEALTH shift who coupled/decouples/no change (Table 5.3) dat_wealth_shift %>% filter(rp==1) %>% select(xwaveid, year, age, wealth_dec, long_wt_rp, mar_change_wealth) %>% arrange(xwaveid, year) %>% filter(year %in% wealth_years ) %>% mutate(wealth_shift = wealth_dec - lag(wealth_dec)) %>% # numbers determine how many periods to lead or lag - retirement may effect wealthome in this period and one period into the future filter(age>=18, year!=2002) %>% mutate(shift_size = case_when( wealth_shift >= 4 ~ "big up", wealth_shift < 4 & wealth_shift > 0 ~ "small up", wealth_shift == 0 ~ "no shift", wealth_shift <= -4 ~ "big down", wealth_shift > -4 & wealth_shift < 0 ~ "small down", TRUE ~ "wrong") ) %>% group_by(shift_size, mar_change_wealth) %>% summarise(group_total = sum(long_wt_rp)) %>% ungroup() %>% group_by(shift_size) %>% mutate(perc = group_total/sum(group_total)) %>% View() ################################################################################## # GINIS - average of 1 year ginis, and multiple year ginis. - 'How do we compare internationally in life course mobility?' section # eq_disp_inc average GINI across 16 years.- 0.302 dat %>% select(year, long_wt, eq_disp_incp) %>% # only positive values group_by(year) %>% summarise(gini = gini(eq_disp_incp, weights = long_wt)) %>% ungroup() %>% summarise(avg = mean(gini)) # GINI for income summed across 16 years - 0.249 dat %>% select(xwaveid, long_wt, eq_disp_incp) %>% # only positive values group_by(xwaveid) %>% summarise(tot_inc = sum(eq_disp_incp), avg_long_wt = mean(long_wt)) %>% ungroup() %>% summarise(gini = gini(tot_inc, weights = avg_long_wt)) # CONSUMPTION average GINI across 10 years - 0.251 dat %>% select(year, long_wt, eq_cons) %>% filter(year >= 2006, !is.na(eq_cons)) %>% group_by(year) %>% summarise(gini = gini(eq_cons, weights = long_wt)) %>% ungroup() %>% summarise(avg = mean(gini)) # GINI for consumption summed across 16 years - 0.221 dat %>% filter(year >= 2006) %>% select(xwaveid, long_wt, eq_cons) %>% # only positive values group_by(xwaveid) %>% summarise(tot_cons = sum(eq_cons, na.rm=FALSE), # discard people with na for consumption in any year avg_long_wt = mean(long_wt)) %>% ungroup() %>% filter(!is.na(tot_cons)) %>% summarise(gini = gini(tot_cons, weights = avg_long_wt)) ################## Absolute v relative mobility section # how many people are earning more in 2016 than in 2001? - 69% dat %>% select(xwaveid, year, long_wt, eq_disp_inc, age) %>% filter(year %in% c(2001, 2016) ) %>% arrange(xwaveid, year) %>% mutate(inc_increase = ifelse(eq_disp_inc > lag(eq_disp_inc), 1, 0) ) %>% filter(year==2016) %>% # filter(age < 58) %>% summarise(perc_increase = sum(long_wt*inc_increase)/sum(long_wt) ) # consumption - 2006 to 2016 - 59.8% dat %>% select(xwaveid, year, long_wt, eq_cons, age) %>% filter(year %in% c(2006, 2016) ) %>% arrange(xwaveid, year) %>% mutate(inc_increase = ifelse(eq_cons > lag(eq_cons), 1, 0) ) %>% filter(year==2016) %>% # filter(age < 58) %>% summarise(perc_increase = sum(long_wt*inc_increase)/sum(long_wt) ) # wealth - 2002 to 2014 - 71.1% dat %>% select(xwaveid, year, long_wt, eq_wealth, age) %>% filter(year %in% c(2002, 2014) ) %>% arrange(xwaveid, year) %>% mutate(inc_increase = ifelse(eq_wealth > lag(eq_wealth), 1, 0) ) %>% filter(year==2014) %>% # filter(age < 58) %>% summarise(perc_increase = sum(long_wt*inc_increase)/sum(long_wt) ) ################################################### # CHAPTER 4 ################################################# ########## # WEALTH ALL AGES by decade (AVERAGE) - Figure 4.13 # emf(file = here("HILDA","charts","mean_wealth_birth_decade.emf"), width = 5.90551, height = 3.14961) dat %>% group_by(year, birth_decade) %>% filter(year %in% wealth_years, !is.na(birth_decade), birth_decade!="1910s") %>% summarise(wealth = weighted.mean(eq_wealth, long_wt), avg_age = weighted.mean(age, long_wt)) %>% # used average of actual age. ggplot( aes(x = avg_age, y = wealth, group = birth_decade, col = birth_decade))+ geom_line(size=1.05)+ geom_point(size=2)+ scale_y_continuous(name="Dollars", expand=c(0,0), limits = c(0,1000000), labels = comma_format(digits = 0) )+ scale_x_continuous(expand = c(0.03, 0.03), breaks = c(seq(25,90,5)), limits = c(23,90))+ scale_color_manual(values=PCcols)+ theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), legend.position = "none", legend.title.align = 0.5, legend.title = element_text()) + labs(x = "Average age of birth cohort", y= "Dollars", col = paste0("Decade",'\n', "of birth")) # dev.off() ################################################################ # POVERTY PERSISTANCE - CHAPTER 6 ############################################################### # create poverty dataset and put in wide format dat_pov <- dat %>% select(xwaveid, year, long_wt, inc_poverty) %>% spread(year, inc_poverty) # rename years so non-numeric and easier to work with dat_pov <- rename(dat_pov, y2001='2001', y2002='2002', y2003='2003', y2004='2004',y2005='2005', y2006='2006', y2007='2007', y2008='2008' , y2009='2009', y2010='2010', y2011='2011', y2012='2012', y2013='2013', y2014='2014', y2015='2015', y2016='2016') # data in long format dat_pov2 <- dat %>% select(xwaveid, year, long_wt, inc_poverty) # how many in poverty in each year num_pov <- dat_pov2 %>% group_by(year) %>% summarise(average = wtd.mean(x=inc_poverty, weight=long_wt) ) # total population for this dataset tot_pop <- dat_pov %>% select(long_wt) %>% sum() # total years spent in poverty out of 16 years dat_pov <- dat_pov %>% mutate(pov_total = rowSums(select(., y2001:y2016 ) ) ) # average number of years spent in poverty - counting only those with at least one year spent in poverty dat_pov %>% filter(pov_total>=1) %>% summarise(wtd.mean(pov_total, weight=long_wt)) # perc spending at least 1 year in poverty one_plus <- dat_pov %>% filter(pov_total>=1) %>% select(long_wt) %>% sum() # percentage one_plus/tot_pop # perc spending at least 5 years in poverty total five_plus <- dat_pov %>% filter(pov_total>=5) %>% select(long_wt) %>% sum() # percentage five_plus/tot_pop # percent spending 5 consecutive years in poverty i = 2001 dat_pov <- mutate(dat_pov, run_01 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2002 dat_pov <- mutate(dat_pov, run_02 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2003 dat_pov <- mutate(dat_pov, run_03 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2004 dat_pov <- mutate(dat_pov, run_04 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2005 dat_pov <- mutate(dat_pov, run_05 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2006 dat_pov <- mutate(dat_pov, run_06 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2007 dat_pov <- mutate(dat_pov, run_07 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2008 dat_pov <- mutate(dat_pov, run_08 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2009 dat_pov <- mutate(dat_pov, run_09 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2010 dat_pov <- mutate(dat_pov, run_10 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2011 dat_pov <- mutate(dat_pov, run_11 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) i = 2012 dat_pov <- mutate(dat_pov, run_12 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) ) rm(i) dat_pov <- mutate(dat_pov, pov_run = case_when ( run_01 ==5 | run_02 ==5 | run_03 ==5 | run_04 ==5 | run_05 ==5 | run_06 ==5 | run_07 ==5 | run_08 ==5 | run_09 ==5 | run_10 ==5 | run_11 ==5 | run_12==5 ~ 1, TRUE ~ 0 ) ) five_plus_run <- dat_pov %>% filter(pov_run ==1) %>% select(long_wt) %>% sum() five_plus_run/tot_pop # percent who have been in poverty for this year and the previous 3 years. last 4 years - 3.57% (exec summary) dat_pov <- mutate(dat_pov, last_4_run = ifelse(y2013==1 & y2014==1 & y2015==1 & y2016==1, 1, 0)) last_4_num <- dat_pov %>% select(long_wt, last_4_run) %>% filter(last_4_run ==1) %>% summarise(sum(long_wt)) last_4_num/tot_pop # probability that a random person spends the first five years in poverty dat_pov %>% summarise(perc = ( sum(if_else(run_01==5, 1, 0)*long_wt)) /sum(long_wt) ) # proportion in poverty in 2001 who are ALSO (not still) in poverty in 2016 pov01 <- dat_pov %>% filter(y2001==1) %>% select(long_wt) %>% sum() pov0116 <- dat_pov %>% filter(y2001==1 & y2016==1) %>% select(long_wt) %>% sum() pov0116/pov01 rm(pov01, pov0116) # proportion in poverty in 2001 who are ALSO (not still) in poverty in 2006 pov01 <- dat_pov %>% filter(y2001==1) %>% select(long_wt) %>% sum() pov0106 <- dat_pov %>% filter(y2001==1 & y2006==1) %>% select(long_wt) %>% sum() pov0106/pov01 rm(pov01, pov0106) # proportion in poverty in 2006 who are ALSO (not still) in poverty in 2011 pov06 <- dat_pov %>% filter(y2006==1) %>% select(long_wt) %>% sum() pov0611 <- dat_pov %>% filter(y2006==1 & y2011==1) %>% select(long_wt) %>% sum() pov0611/pov06 rm(pov06, pov0611) # proportion in poverty in 2011 who are ALSO (not still) in poverty in 2016 pov11 <- dat_pov %>% filter(y2011==1) %>% select(long_wt) %>% sum() pov1116 <- dat_pov %>% filter(y2011==1 & y2016==1) %>% select(long_wt) %>% sum() pov1116/pov11 rm(pov11, pov1116) # What proportion of OF THOSE WHO did not start out HILDA in poverty, moved into it after 2001? no_pov01 <- dat_pov %>% filter(y2001==0) %>% select(long_wt) %>% sum() # total years spent in poverty out of 15 (not 2001) dat_pov <- dat_pov %>% mutate(pov_total_excl01 = rowSums(select(., y2002:y2016 ) ) ) # perc spending at least 1 year in poverty one_plus_excl01 <- dat_pov %>% filter(pov_total_excl01 >= 1) %>% select(long_wt) %>% sum() # percentage one_plus_excl01/no_pov01 ################## # How many people move out of poverty, but then move back in? - looking for a 1,0,1 pattern in income poverty, which equate to a -1,1 pattern in pov_change dat_pov <- dat %>% select(xwaveid, year, long_wt, inc_poverty) %>% spread(year, inc_poverty) dat_pov2 <- dat %>% select(xwaveid, long_wt, year, inc_poverty) dat_pov2 <- dat_pov2 %>% mutate(pov_change = case_when ( year==2001 ~ as.numeric(0), TRUE ~ inc_poverty - lag(inc_poverty) # 1 signifies a move into poverty, -1 signifies a move out ) ) dat_change <- dat_pov2 %>% select(-inc_poverty) %>% filter(!year==2001) %>% # exclude 2001 because not possible to record change in poverty status spread(year, pov_change) # rename years so non-numeric and easier to work with dat_change <- rename(dat_change, y2002='2002', y2003='2003', y2004='2004',y2005='2005', y2006='2006', y2007='2007', y2008='2008' , y2009='2009', y2010='2010', y2011='2011', y2012='2012', y2013='2013', y2014='2014', y2015='2015', y2016='2016') # unite into one string dat_string <- unite(dat_change, change_string, y2002:y2016, sep="") #sub N for eNter poverty and X for eXit poverty dat_string <- mutate(dat_string, stringB = gsub(pattern="-1", replacement="X", dat_string$change_string)) dat_string <- mutate(dat_string, string = gsub(pattern="1", replacement="N", dat_string$stringB)) dat_string <- select(dat_string, xwaveid, long_wt, string) # match pattern X first then N in string dat_string <- mutate(dat_string, out_in = grepl(pattern = "X.*N", x=dat_string$string) ) ### what proportion of all people move out of poverty and then back in (population weighted) summarise(dat_string, wtd.mean(out_in, weight=long_wt)) # match pattern at least one X dat_string <- mutate(dat_string, out = grepl(pattern = "X", x=dat_string$string) ) # ccould use str_detect (stringer package) instead - returns TRUE, FALSE, FALSE # weighted sum of outs and out_ins total_out <- dat_string %>% filter(out==TRUE) %>% summarise(sum(long_wt)) total_out_in <- dat_string %>% filter(out_in==TRUE) %>% summarise(sum(long_wt)) ### What proportion of respondents who moved out of poverty during HILDA later moved back into poverty? total_out_in/total_out ################################## # For people who move out of poverty (X) and then back in (N), what is average amount of time spent out of poverty # only include those people with a match dat_out_in <- dat_string %>% filter(out_in==TRUE) # count number of matching strings dat_out_in <- dat_out_in %>% mutate(count = str_count(string=.$string, pattern = "X0*N")) # extract all strings - including multiple strings from those people that have more than one dat_out_in <- dat_out_in %>% mutate(match = str_extract_all(string=.$string, pattern = "X0*N")) # make each string an observation dat_out_in <- unnest(dat_out_in, match) # count lengths of extracted strings # substract one from string length - want number of periods spent out, not length of string (which includes period in which they returned) dat_out_in <- dat_out_in %>% mutate(time_out = str_length(string=.$match) - 1 ) # calcualted weighted mean of string length wtd.mean(dat_out_in$time_out, weight=dat_out_in$long_wt) ################### # For people who exit poverty (X) and then re-enter (N), how long are their spells out of poverty # 'about one half spend two or more years out of poverty' text in chapter 6. dat_out_in %>% group_by(time_out) %>% summarise(num = sum(long_wt)) %>% mutate(perc = num/sum(num)) %>% mutate(perc_culm = cumsum(perc)) ################################## # For people who move into poverty(N) and then out (X), what is average amount of time spent in poverty # count number of matches in string dat_string <- mutate(dat_string, in_out = str_count(string=dat_string$string, pattern="N0*X") ) # create dataframe just of those people with 1 or more match dat_in_out <- dat_string %>% select(xwaveid, long_wt, string, in_out) %>% filter(in_out>=1) # extract strings- including multiple strings from those people that have more than one dat_in_out <- dat_in_out %>% mutate(match = str_extract_all(string=.$string, pattern = "N0*X")) # make each string an observation dat_in_out <- unnest(dat_in_out, match) # count lengths of extracted strings # substract one from string length - want number of periods spent in, not length of string (which includes period in which they returned) dat_in_out <- dat_in_out %>% mutate(time_in = str_length(string=.$match) - 1 ) # calcualted weighted mean of time in wtd.mean(dat_in_out$time_in, weight=dat_in_out$long_wt) ############################################## # SURVIVAL FUNCTIONS # For people who move into poverty (N) whether or not they move out, how long to they spend in poverty - for SURVIVAL FUNCTION # remove excess stuff from dataframe dat_spell <- dat_string %>% select(xwaveid, long_wt, string) # count number of spells lasting at least x years dat_spell <- dat_spell %>% mutate("1" = str_count(string=dat_spell$string, pattern="N") , "2" = str_count(string=dat_spell$string, pattern="N0"), "3" = str_count(string=dat_spell$string, pattern="N00") , "4" = str_count(string=dat_spell$string, pattern="N000") , "5" = str_count(string=dat_spell$string, pattern="N0000") , "6" = str_count(string=dat_spell$string, pattern="N00000") , "7" = str_count(string=dat_spell$string, pattern="N000000") , "8" = str_count(string=dat_spell$string, pattern="N0000000") , "9" = str_count(string=dat_spell$string, pattern="N00000000") , "10" = str_count(string=dat_spell$string, pattern="N000000000") , "11" = str_count(string=dat_spell$string, pattern="N0000000000") , "12" = str_count(string=dat_spell$string, pattern="N00000000000") , "13" = str_count(string=dat_spell$string, pattern="N000000000000") , "14" = str_count(string=dat_spell$string, pattern="N0000000000000") , "15" = str_count(string=dat_spell$string, pattern="N00000000000000") , "16" = str_count(string=dat_spell$string, pattern="N000000000000000") ) # generate survival percentage - "% of poverty spells lasting at least x years" survival <- dat_spell %>% select(-xwaveid, -string) %>% gather(length, count, -long_wt) %>% group_by(length) %>% summarise(pop = sum(count*long_wt)) %>% mutate(length=as.numeric(length)) %>% arrange(length) %>% mutate(perc = 100*(pop/pop[1]) ) %>% filter(length !=1) # plot survival function survival %>% ggplot(aes(x=length, y=perc)) + geom_point(col=blue) + geom_line(col=blue) + scale_x_continuous(breaks = seq(2,16,1), name="Lasting at least this many years") + ylab("Percentage of poverty spells") ############################################################################################## # VOLATILITY OF INCOME and CONSUMPTION - CHAPTER 6 ##################################################################################### # Notes from Hardy 2016 - Sample is restricted to household heads between ages 25 and 60 that have non-negative pre-tax & pre-transfer income, # without imputed income, and excluding the top 5% of income over each annual sample to exclude topcodes. # percentage change formula follows Hardy 2016 and Dynan, Elmendorf and Sichel 2012 # should be no top coding for income. for consumption? #The top-coding thresholds are adjusted over time to overcome the tendency of income and wealth measures to inflate. #If you need to know the threshold values that have been used at a particular release, see “HILDA-thresholds-by-wave.csv” in the release documentation. # INCOME # positive values for eq_disp_inc only (p suffix) dat <- dat %>% arrange(xwaveid, year) %>% mutate(perc_change_incp = 100*( (eq_disp_incp - lag(eq_disp_incp) ) / ((abs(eq_disp_incp) + abs(lag(eq_disp_incp)))/2) ) ) # CONSUMPTION dat <- dat %>% arrange(xwaveid, year) %>% mutate(perc_change_cons = 100*( (eq_cons - lag(eq_cons) ) / ((abs(eq_cons) + abs(lag(eq_cons)))/2) ) ) ############# # VOLATILITY OF INCOME AND CONSUMPTION - POVERTY V NO POVERTY # group by whether or not in income poverty # consumption and income variability - averaged over 2006-2016 dat %>% filter(year>2006, age>=25, age<60) %>% # can't calculate income change for 2006. group_by(year, inc_poverty) %>% summarise(cons_sd = sqrt(wtd.var(x=perc_change_cons, weights = long_wt) ), inc_sd = sqrt(wtd.var(x=perc_change_incp, weights = long_wt) ) ) %>% gather(key = "measure", value="value", -year, -inc_poverty) %>% ungroup() %>% group_by(inc_poverty, measure) %>% summarise(average = mean(value)) %>% View() ###################################################################### # POVERTY (SAMPLE 2013 to 2016) ###################################################################### # use long_13thru16 sample for this # create poverty dataset and put in wide format dat_pov13 <- dat %>% select(xwaveid, year, wlem_p, inc_poverty) %>% rename(long_wt_last4 = wlem_p) %>% # weights for those enumerated in the last 4 years spread(year, inc_poverty) # rename years so non-numeric and easier to work with dat_pov13 <- rename(dat_pov13, y2013='2013', y2014='2014', y2015='2015', y2016='2016') # total population for this dataset tot_pop13 <- dat_pov13 %>% summarise(sum(long_wt_last4)) # percent who have been in poverty for this year and the previous 3 years. last 4 years - 3.84% (compared to 3.57% using full sample) (exec summary) dat_pov13 <- mutate(dat_pov13, last_4_run = ifelse(y2013==1 & y2014==1 & y2015==1 & y2016==1, 1, 0)) last_4_num13 <- dat_pov13 %>% select(long_wt_last4, last_4_run) %>% filter(last_4_run ==1) %>% summarise(sum(long_wt_last4)) # percent of population last_4_num13/tot_pop13 # percent of population as at June 2016 ((http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/3101.0Dec%202017?OpenDocument) 24190907*(last_4_num13/tot_pop13)