# Productivity Commission 2018 analysis for the Rising Inequality? Commission Research Paper

# Longitudinal HILDA analysis


#################################################################################
# ATTACH TO ALL OBSERVATIONS - LONGITUDINAL WEIGHTS 2001 THROUGH 2016

# create file with 2001 through 2016 weights for each enumerated and responding person
dat2 <-  dat %>%
  filter(year==2016) %>%
  select (xwaveid, lnwte, lnwtrp) %>% 
  rename(long_wt=lnwte, long_wt_rp = lnwtrp)

# join to main file and remove old object
dat <- left_join(dat, dat2, by="xwaveid")
rm(dat2)


##########################################
# INCOME DECILES AND POVERTY
##########################################

# weighted cut points       
cuts_long <- dat %>%
  group_by(year)  %>%
  do(data.frame(t(wtd.quantile(.$eq_disp_inc, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>%  
  ungroup() %>% 
  rename(cut0=X..0., cut10=X.10., cut20=X.20., cut30=X.30., cut40=X.40., cut50=X.50., cut60=X.60., cut70=X.70., cut80=X.80., cut90=X.90., cut100=X100.)

# add in 50% of income variable
cuts_long <- mutate(cuts_long, half_median = 0.5*cut50)

# join weighted cut points to the dataframe
dat <- left_join(dat, cuts_long, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat <-  dat %>%
  mutate(inc_dec = case_when(
    eq_disp_inc <= cut10                       ~ 1,
    eq_disp_inc > cut10 & eq_disp_inc <= cut20 ~ 2,
    eq_disp_inc > cut20 & eq_disp_inc <= cut30 ~ 3,
    eq_disp_inc > cut30 & eq_disp_inc <= cut40 ~ 4,
    eq_disp_inc > cut40 & eq_disp_inc <= cut50 ~ 5,
    eq_disp_inc > cut50 & eq_disp_inc <= cut60 ~ 6,
    eq_disp_inc > cut60 & eq_disp_inc <= cut70 ~ 7,
    eq_disp_inc > cut70 & eq_disp_inc <= cut80 ~ 8,
    eq_disp_inc > cut80 & eq_disp_inc <= cut90 ~ 9,
    eq_disp_inc > cut90                        ~ 10,
    TRUE ~ -99
  )
  )


# assign each observation a 1 if equivalised household disposable income less than half of median, 0 otherwise
dat <- dat %>%
  mutate(inc_poverty = case_when(
    eq_disp_inc < half_median ~ 1,
    eq_disp_inc >= half_median ~ 0,
    TRUE ~ -99)
  )



#################################################################
# WEALTH DECILES
##############################################################

# weighted cut points       
cuts_long_wealth <- dat %>%
  group_by(year)  %>%
  filter(year %in% wealth_years) %>%
  do(data.frame(t(wtd.quantile(.$eq_wealth, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1)) ) ) ) %>% 
  ungroup() %>% 
rename( wealth_cut0=X..0., wealth_cut10=X.10., wealth_cut20=X.20., wealth_cut30=X.30., wealth_cut40=X.40., wealth_cut50=X.50., 
                            wealth_cut60=X.60., wealth_cut70=X.70., wealth_cut80=X.80., wealth_cut90=X.90., wealth_cut100=X100.)

# join weighted cut points to the dataframe
dat <- left_join(dat, cuts_long_wealth, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat <-  dat %>%
  mutate(wealth_dec = case_when(
    eq_wealth <= wealth_cut10                       ~ 1,
    eq_wealth > wealth_cut10 & eq_wealth <= wealth_cut20 ~ 2,
    eq_wealth > wealth_cut20 & eq_wealth <= wealth_cut30 ~ 3,
    eq_wealth > wealth_cut30 & eq_wealth <= wealth_cut40 ~ 4,
    eq_wealth > wealth_cut40 & eq_wealth <= wealth_cut50 ~ 5,
    eq_wealth > wealth_cut50 & eq_wealth <= wealth_cut60 ~ 6,
    eq_wealth > wealth_cut60 & eq_wealth <= wealth_cut70 ~ 7,
    eq_wealth > wealth_cut70 & eq_wealth <= wealth_cut80 ~ 8,
    eq_wealth > wealth_cut80 & eq_wealth <= wealth_cut90 ~ 9,
    eq_wealth > wealth_cut90                        ~ 10,
    TRUE ~ -99
  )
  )


#############################################################################
# CONSUMPTION
###########################################################################
# CREATE CONSUMPTION DATAFRAME
# drop those not appropriate for consumption - generate new consumption data frame
# those before 2006  (and those with missing rent (<0) )
dat_cons <- dat %>% 
  filter(year >= 2006) %>% 
  subset(eq_rent >= 0) 


# CONSUMPTION DECILES

# weighted cut points       
cuts_long_cons <- dat_cons %>%
  group_by(year)  %>%
  do(data.frame(t(wtd.quantile(.$eq_cons, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1)) ) ) ) %>% 
  ungroup() %>% 
  rename(cons_cut0=X..0., cons_cut10=X.10., cons_cut20=X.20., cons_cut30=X.30., cons_cut40=X.40., cons_cut50=X.50., 
       cons_cut60=X.60., cons_cut70=X.70., cons_cut80=X.80., cons_cut90=X.90., cons_cut100=X100.)


# join weighted cut points to the dataframe
dat_cons <- left_join(dat_cons, cuts_long_cons, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat_cons <-  dat_cons %>%
  mutate(cons_dec = case_when(
    eq_cons <= cons_cut10                       ~ 1,
    eq_cons > cons_cut10 & eq_cons <= cons_cut20 ~ 2,
    eq_cons > cons_cut20 & eq_cons <= cons_cut30 ~ 3,
    eq_cons > cons_cut30 & eq_cons <= cons_cut40 ~ 4,
    eq_cons > cons_cut40 & eq_cons <= cons_cut50 ~ 5,
    eq_cons > cons_cut50 & eq_cons <= cons_cut60 ~ 6,
    eq_cons > cons_cut60 & eq_cons <= cons_cut70 ~ 7,
    eq_cons > cons_cut70 & eq_cons <= cons_cut80 ~ 8,
    eq_cons > cons_cut80 & eq_cons <= cons_cut90 ~ 9,
    eq_cons > cons_cut90                        ~ 10,
    TRUE ~ -99
  )
  )


# join consumption deciles to main frame
cons_join <- dat_cons %>% 
  select(xwaveid, year, cons_dec)

dat <- left_join(dat, cons_join, by=c("xwaveid","year"))

rm(cons_join)







########################################################################################################
# BEGINNING OF OPTIONAL ANALYSIS - CAN BE DONE BIT BY BIT
######################################################################################################





#########################################################
# MOBILITY
###########################################################


#############
# MOVEMENT BETWEEN INCOME DECILES - MOBILITY GRID - TABLE 5.1 (INCOME)

inc_path0116 <- as.data.frame(10*prop(wtd.table(dat$inc_dec[dat$year == 2001], 
                                                dat$inc_dec[dat$year == 2016],
                                                dat$long_wt[dat$year == 2016])))

colnames(inc_path0116) <- c("inc_dec01", "inc_dec16", "per_cent")

inc_path0116 <- inc_path0116 %>% filter(inc_dec01!="Total", inc_dec16!="Total")

# for table - wide to long
inc_path0116 %>% 
  spread(inc_dec16, per_cent)


#############
# MOVEMENT BETWEEN WEALTH DECILES  - MOBILITY GRID - TABLE 5.2 (WEALTH)


wealth_path0214_B <- as.data.frame(10*prop(wtd.table(dat$wealth_dec[dat$year == 2002], 
                                                     dat$wealth_dec[dat$year == 2014],
                                                     dat$long_wt[dat$year == 2016])))

colnames(wealth_path0214_B) <- c("wealth_dec02", "wealth_dec14", "per_cent")

wealth_path0214_B <- wealth_path0214_B %>% filter(wealth_dec02!="Total", wealth_dec14!="Total")

# for table - wide to long
wealth_path0214_B %>% 
  spread(wealth_dec14, per_cent)

# percentage of people that are more than $6000 above the top income bracket (>$96000)
# 'but most of those in the top decile would remain in the top decile if they incurred the same drop in income' (in text, chap 5)

dat %>%
  select(hhwt, eq_disp_inc) %>% 
  mutate(stuck_high = ifelse(eq_disp_inc > 96000,1,0) ) %>% 
  summarise(perc = sum(hhwt*stuck_high)/sum(hhwt) )



####################################
# Sankey/Alluvial diagrams - Figure 5.5 (main Sankey diagram)

# get data in right format - all deciles alluvial
dat_al <- dat %>%
  select(xwaveid, year, inc_dec, long_wt) %>% 
  filter(year==2001 | year==2016) %>% 
  spread(key=year, value=inc_dec) %>% 
  rename(d2001='2001', d2016='2016') %>% 
  group_by(d2001, d2016) %>% 
  summarise(pop = sum(long_wt) )



# emf(file = here("HILDA", "charts", "alluvial_dec_hide.emf"), width = 5.90551, height = 7.7)

# alluvial with all deciles - only three colours showing, rest white (for report - tall version)
alluvial(dat_al[,1:2], 
         freq=dat_al$pop,
         hide = dat_al$d2001 %in% c(2,3,4,6,7,8,9),
         col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow,
                          dat_al$d2001==5 ~ alluvial_blue,
                          dat_al$d2001==10 ~ alluvial_green,
                          TRUE ~ grey) ,
         border=NA,
         alpha=0.7,
         # gap.width=0,
         axis_labels = c("2000-01","2015-16"),
         cw=0.05,
         xw=0.4)

# dev.off()



# same chart - for presentation on slide (couldn't get it for dark background though) - wide version. different font.

# emf(file = here("HILDA", "charts", "alluvial_dec_hide_presentation.emf"), width = 9.5, height = 5.5)

# svg(file = here("HILDA", "charts", "alluvial_dec_hide_presentation.svg"))

# alluvial with all deciles - only three colours showing, rest white
alluvial(dat_al[,1:2], 
         freq=dat_al$pop,
         hide = dat_al$d2001 %in% c(2,3,4,6,7,8,9),
         col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow,
                          dat_al$d2001==5 ~ alluvial_blue,
                          dat_al$d2001==10 ~ alluvial_green,
                          TRUE ~ grey) ,
         border=NA, # border on ribbons
         alpha=0.7, 
         # gap.width=0,
         axis_labels = c("2000-01","2015-16"),
         cex = 1.5, # font size
         cex.axis = 1.5, # font size labels
         cw=0.08,
         xw=0.4)

# dev.off()


# just top decile
# emf(file = here("HILDA", "charts", "alluvial_dec_hide_presentation_top_only.emf"), width = 9.5, height = 5.5)

alluvial(dat_al[,1:2], 
         freq=dat_al$pop,
         hide = dat_al$d2001 %in% c(1, 2,3,4,5,6,7,8,9),
         col = case_when( #dat_al$d2001 == 1 ~ alluvial_yellow,
                         # dat_al$d2001==5 ~ alluvial_blue,
                          dat_al$d2001==10 ~ alluvial_green,
                          TRUE ~ grey) ,
         border=NA, # border on ribbons
         alpha=0.7, 
         # gap.width=0,
         axis_labels = c("2000-01","2015-16"),
         cex = 1.5, # font size
         cex.axis = 1.5, # font size labels
         cw=0.08,
         xw=0.4)

# dev.off()

# just top and bottom
# emf(file = here("HILDA", "charts", "alluvial_dec_hide_presentation_top_and_bottom.emf"), width = 9.5, height = 5.5)

alluvial(dat_al[,1:2], 
         freq=dat_al$pop,
         hide = dat_al$d2001 %in% c(2,3,4,5, 6,7,8,9),
         col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow,
                        #  dat_al$d2001==5 ~ alluvial_blue,
                          dat_al$d2001==10 ~ alluvial_green,
                          TRUE ~ grey) ,
         border=NA, # border on ribbons
         alpha=0.7, 
         # gap.width=0,
         axis_labels = c("2000-01","2015-16"),
         cex = 1.5, # font size
         cex.axis = 1.5, # font size labels
         cw=0.08,
         xw=0.4)

# dev.off()


# all ten deciles - taste the rainbow! :)
# emf(file = here("HILDA", "charts", "alluvial_dec_presentation_10.emf"), width = 9.5, height = 5.5)

alluvial(dat_al[,1:2], 
         freq=dat_al$pop,
        # hide = dat_al$d2001 %in% c(2,3,4,6,7,8,9),
         col = case_when( dat_al$d2001 == 1 ~ alluvial_yellow,
                          dat_al$d2001==5 ~ alluvial_blue,
                          dat_al$d2001==10 ~ alluvial_green,
                          dat_al$d2001==2 ~ orange,
                          dat_al$d2001==3 ~ red,
                          dat_al$d2001==4 ~ purple,
                          dat_al$d2001==6 ~ mid_blue,
                          dat_al$d2001==7 ~ dark_blue,
                          dat_al$d2001==8 ~ dark_green,
                          dat_al$d2001==9 ~ PC_green,
                          TRUE ~ grey) ,
         border=NA, # border on ribbons
         alpha=0.7, 
         # gap.width=0,
         axis_labels = c("2000-01","2015-16"),
         cex = 1.5, # font size
         cex.axis = 1.5, # font size labels
         cw=0.08,
         xw=0.4)

# dev.off()



#######
# Sankey/Alluvial - Absolute income values - 3 categories, fixed levels (Absolute mobility v relative mobility) - Figure 5.10

# get data sorted
dat_quin_abs <- dat %>%
  select(xwaveid, year, eq_disp_inc, long_wt) %>% 
  mutate(inc_quin = case_when(
    eq_disp_inc <30000 ~ "$30k and less",
    eq_disp_inc >=30000 & eq_disp_inc <50000 ~ "$30k to $50k",
    eq_disp_inc >=50000 ~ "$50k and up",
    TRUE ~ "Problem" ) ) %>% 
  select(-eq_disp_inc) %>% 
  filter(year==2001 | year==2016) %>% 
  spread(key=year, value=inc_quin) %>% 
  rename(d2001='2001', d2016='2016') %>% 
  group_by(d2001, d2016) %>% 
  summarise(pop = sum(long_wt) )


# emf(file = here("HILDA", "charts", "alluvial_abs.emf"), width = 5.90551, height = 3.14961)

# chart
alluvial(dat_quin_abs[,1:2], 
         freq=dat_quin_abs$pop,
         col = case_when(dat_quin_abs$d2001 == "$30k and less" ~ "#A52828",
                         dat_quin_abs$d2001=="$30k to $50k" ~ "#66BCDB",
                         dat_quin_abs$d2001=="$50k and up" ~ "#265A9A",
                         TRUE ~ "grey") ,
         border=NA,
         alpha=0.7,
         axis_labels = c("2000-01","2015-16"),
         gap.width=0.0, # relative width of inter-category gaps
         xw=0.3, # the distance from the set axis to the control points of the xspline
         cw=0.12, # width of the category axis
         cex=1) # font size of category labels

# dev.off()


# using this dataset - how many people END UP in each category? - under $30,000 = 22.7%, $30,000 to $50,000 = 31.6%, more than $50,000 = 45.6%
dat_quin_abs %>% 
  group_by(d2016) %>% 
  mutate(total_pop = sum(pop) ) %>% 
  ungroup() %>% 
  mutate(perc = 3*total_pop/sum(total_pop)) %>% 
  arrange(d2016) %>% 
  View()

# using this dataset - how many people START in each category? - under $30,000 = 33.6%, $30,000 to $50,000 = 38.7%, more than $50,000 = 27.6%
dat_quin_abs %>% 
  group_by(d2001) %>% 
  mutate(total_pop = sum(pop) ) %>% 
  ungroup() %>% 
  mutate(perc = 3*total_pop/sum(total_pop)) %>% 
  arrange(d2001) %>% 
  View()



############################
# US alluvial for income mobility - From 'Fisher et al 2016 Inequaltiy and Mobility using income, consumption, wealth for the same individuals' (p. 54)

# NOTES:
# restricted to adults who are between 25 and 50 in 1999 (p. 53) 
# income is reported for the previous taxable year, measures are in 2013 constant dollars, equivalence scale given by square root of family size, (p. 49)
# we use family level file - but other measures are at individual level (p. 49). Also say 'individual mobility' (p. 53) and talk about 53 percent of people' (p. 53,
# so on balance, I think they are using individual level for mobility.

US_al <- read.csv(file=here("US intragenerational income mobility, Fisher et al 2016.csv"))

# emf(file = here("HILDA", "charts", "alluvial_US.emf"), width = 2.95, height = 3.14961)

# svg(file = here("HILDA", "charts", "alluvial_US.svg"))

alluvial(US_al[,1:2], 
         freq=US_al$pop,
         hide = US_al$q1999 %in% c(2,4),
         col = case_when( US_al$q1999 == 1 ~ alluvial_yellow,
                          US_al$q1999==3 ~ alluvial_blue,
                          US_al$q1999==5 ~ alluvial_green,
                          TRUE ~ grey) ,
         border=NA, # border on ribbons
         alpha=0.7, 
         # gap.width=0,
         axis_labels = c("1998-99","2012-13"),
         cex = 1, # font size
         cex.axis = 1, # font size labels
         cw=0.08,
         xw=0.4)

# dev.off()

##################
# Australian version to match up with the US (2000-01 to 2014-15, closest matching time period)

# dat <- mutate(dat, sqrt_eq_disp_inc = hh_disp_inc/square_root_eq)


# INCOME DECILES - SQUARE ROOT EQUIVALENCE SCALE

# weighted cut points       
cuts_long_sqrt <-  dat %>%
 filter( (year==2001 & (age>25 & age<50) ) | (year==2015 & (age>39 & age<64) ) | year %in% c(2002:2014) | year==2016 ) %>%    # for 2001 and 2015, only use people within age groups
  group_by(year)  %>%
  do(data.frame(t(wtd.quantile(.$sqrt_eq_disp_inc, weights=.$long_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>%  
  ungroup() %>% 
  rename(cut_sqrt0=X..0., cut_sqrt10=X.10., cut_sqrt20=X.20., cut_sqrt30=X.30., cut_sqrt40=X.40., cut_sqrt50=X.50., 
         cut_sqrt60=X.60., cut_sqrt70=X.70., cut_sqrt80=X.80., cut_sqrt90=X.90., cut_sqrt100=X100.)

# join weighted cut points to the dataframe
dat <- left_join(dat, cuts_long_sqrt, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat <-  dat %>%
  mutate(sqrt_inc_dec = case_when(
    sqrt_eq_disp_inc <= cut_sqrt10                       ~ 1,
    sqrt_eq_disp_inc > cut_sqrt10 & sqrt_eq_disp_inc <= cut_sqrt20 ~ 2,
    sqrt_eq_disp_inc > cut_sqrt20 & sqrt_eq_disp_inc <= cut_sqrt30 ~ 3,
    sqrt_eq_disp_inc > cut_sqrt30 & sqrt_eq_disp_inc <= cut_sqrt40 ~ 4,
    sqrt_eq_disp_inc > cut_sqrt40 & sqrt_eq_disp_inc <= cut_sqrt50 ~ 5,
    sqrt_eq_disp_inc > cut_sqrt50 & sqrt_eq_disp_inc <= cut_sqrt60 ~ 6,
    sqrt_eq_disp_inc > cut_sqrt60 & sqrt_eq_disp_inc <= cut_sqrt70 ~ 7,
    sqrt_eq_disp_inc > cut_sqrt70 & sqrt_eq_disp_inc <= cut_sqrt80 ~ 8,
    sqrt_eq_disp_inc > cut_sqrt80 & sqrt_eq_disp_inc <= cut_sqrt90 ~ 9,
    sqrt_eq_disp_inc > cut_sqrt90                        ~ 10,
    TRUE ~ -99
  )
  )

# turn deciles into quintiles
dat <- dat %>%
  mutate(sqrt_inc_quin = case_when(
    sqrt_inc_dec %in% c(1,2) ~ 1,
    sqrt_inc_dec %in% c(3,4) ~ 2,
    sqrt_inc_dec %in% c(5,6) ~ 3,
    sqrt_inc_dec %in% c(7,8) ~ 4,
    sqrt_inc_dec %in% c(9,10) ~ 5,
    TRUE ~ -99  ) )

# get data sorted for alluvial chart

# dat_al_sqrt_quin <- dat %>%
#   select(xwaveid, year, sqrt_inc_quin, long_wt, age) %>% 
#   filter((year==2001 & (age>25 & age<50) ) | (year==2015 & (age>39 & age<64) ) )  %>%  View()          # using these years as closest match to US data.
#   spread(key=year, value=sqrt_inc_quin) %>% 
#   rename(q2001='2001', q2015='2015') %>%


  
quin2001 <- dat %>%
  select(xwaveid, year, sqrt_inc_quin, long_wt, age) %>% 
  filter((year==2001 & (age>25 & age<50) ) ) %>% 
  rename(q2001 = sqrt_inc_quin) %>% 
  select(xwaveid, q2001) 
  
quin2015 <- dat %>%
  select(xwaveid, year, sqrt_inc_quin, long_wt, age) %>% 
  filter((year==2015 & (age>39 & age<64)) ) %>% 
  rename(q2015 = sqrt_inc_quin) %>% 
  select( xwaveid, q2015, long_wt ) 

dat_al_sqrt_quin <-  left_join(quin2001, quin2015, by="xwaveid")


# calculations - population and percentage
dat_al_sqrt_quin <- dat_al_sqrt_quin %>% 
  group_by(q2001, q2015) %>% 
  summarise(pop = sum(long_wt) ) %>% 
  ungroup() %>% 
  group_by(q2001) %>% 
  mutate(perc = 100*pop/sum(pop)) 


# emf(file = here("HILDA", "charts", "alluvial_Aus_match_US.emf"), width = 2.95, height = 3.14961)

# svg(file = here("HILDA", "charts", "alluvial_Aus_match_US.svg"))

# alluvial chart - Australi to match the US
alluvial(dat_al_sqrt_quin[,1:2], 
         freq=dat_al_sqrt_quin$pop,
         hide = dat_al_sqrt_quin$q2001 %in% c(2,4),
         col = case_when( dat_al_sqrt_quin$q2001 == 1 ~ alluvial_yellow,
                          dat_al_sqrt_quin$q2001==3 ~ alluvial_blue,
                          dat_al_sqrt_quin$q2001==5 ~ alluvial_green,
                          TRUE ~ grey) ,
         border=NA, # border on ribbons
         alpha=0.7, 
         # gap.width=0,
         axis_labels = c("2000-01","2014-15"),
         cex = 1, # font size
         cex.axis = 1, # font size labels
         cw=0.08,
         xw=0.4)

# dev.off()


#############################################
# MOBILITY STATS

#####
# how many people spend at least one period in the top income decile

pop_top <- dat %>%
  select(xwaveid, long_wt, year, inc_dec) %>% 
  mutate(top = ifelse(inc_dec==10, 1, 0) ) %>% 
  select(-inc_dec) %>%
  arrange(xwaveid, year) %>% 
  mutate(tot=cumsum(top)) %>% 
  filter(year==2016) %>% 
  mutate(num_top = tot-lag(tot)) %>% 
  mutate(winner = ifelse(num_top>0, 1, 0)) %>% 
  filter(winner==1) %>% 
  summarise(sum(long_wt))

tot_pop <- dat %>%
  filter(year==2016) %>% 
  summarise(sum(long_wt))

pop_top/tot_pop


#####
# how many people spend at least one period in the bottom decile

pop_bottom <- dat %>%
  select(xwaveid, long_wt, year, inc_dec) %>% 
  mutate(bottom = ifelse(inc_dec==1, 1, 0) ) %>% 
  select(-inc_dec) %>%
  arrange(xwaveid, year) %>% 
  mutate(tot=cumsum(bottom)) %>% 
  filter(year==2016) %>% 
  mutate(num_bottom = tot-lag(tot)) %>% 
  mutate(one_bottom = ifelse(num_bottom>0, 1, 0)) %>% 
  filter(one_bottom==1) %>% 
  summarise(sum(long_wt))

tot_pop <- dat %>%
  filter(year==2016) %>% 
  summarise(sum(long_wt))

pop_bottom/tot_pop


#####
# how many people spent at least one period in the top and one period in the bottom deciles - (in chapter 3)

pop_top_bottom <- dat %>%
  select(xwaveid, long_wt, year, inc_dec) %>% 
  mutate(bottom = ifelse(inc_dec==1, 1, 0),
         top = ifelse(inc_dec==10, 1, 0) ) %>% 
  select(-inc_dec) %>%
  arrange(xwaveid, year) %>% 
  mutate(tot_bottom=cumsum(bottom),
         tot_top = cumsum(top) ) %>% 
  filter(year==2016) %>% 
  mutate(num_bottom = tot_bottom - lag(tot_bottom),
         num_top = tot_top - lag(tot_top) ) %>% 
  mutate(top_bottom = ifelse(num_bottom>0 & num_top>0, 1, 0)) %>% 
  filter(top_bottom==1) %>% 
  summarise(sum(long_wt))

tot_pop <- dat %>%
  filter(year==2016) %>% 
  summarise(sum(long_wt))

pop_top_bottom/tot_pop

###############
# how many income deciles does each person inhabit over the whole period? - INCOME (in key points)

# on average
dat %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(n_inc_dec = n_distinct(inc_dec)) %>% 
  ungroup() %>% 
  summarise(avg_inc_dec = wtd.mean(n_inc_dec, long_wt) )


# how many people stay in the same decile, by starting income decile
dat %>% 
  arrange(xwaveid, year) %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(n_inc_dec = n_distinct(inc_dec),
            inc_dec01 = first(inc_dec)) %>% 
  ungroup() %>% 
  group_by(inc_dec01, n_inc_dec) %>%
  summarise(group_total = sum(long_wt)) %>% 
  ungroup() %>% 
  group_by(inc_dec01) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  View()


# how many people were in both top income and top wealth decile in all years
entrenched_advantage <-  dat %>%
  select(xwaveid, long_wt, year, inc_dec, wealth_dec) %>% 
   group_by(xwaveid, long_wt) %>% 
  filter(n_distinct(inc_dec)==1,
         inc_dec==10) %>% 
    ungroup() %>% 
  filter(year %in% wealth_years) %>% 
      group_by(xwaveid, long_wt) %>%
  filter(n_distinct(wealth_dec)==1,
         wealth_dec==10) %>% 
  ungroup() %>% 
  filter(year==2014) %>% 
  summarise(group_total = sum(long_wt) )
         
  tot_pop <- dat %>%
    filter(year==2014) %>% 
    summarise(sum(long_wt))
  
entrenched_advantage/tot_pop
       

# how many people spent at least one year in a decile (by decile)

dat %>%
  select(xwaveid, long_wt, inc_dec, year) %>% 
  group_by(year) %>% 
  mutate(tot = sum(long_wt)) %>% 
  ungroup() %>% 
  group_by(inc_dec) %>%
  distinct(xwaveid, .keep_all=TRUE) %>% 
  summarise(group_total = sum(long_wt)/ mean(tot) ) %>% 
  View()



############
# how many WEALTH deciles does each person inhabit over the whole period? - WEALTH


# how many people stay in the same decile, by starting wealth deciles
dat %>% 
  filter(year %in% wealth_years) %>% 
  arrange(xwaveid, year) %>% 
  select(xwaveid, long_wt, year, wealth_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(n_wealth_dec = n_distinct(wealth_dec),
            wealth_dec02 = first(wealth_dec)) %>% 
  ungroup() %>% 
  group_by(wealth_dec02, n_wealth_dec) %>%
  summarise(group_total = sum(long_wt)) %>% 
  ungroup() %>% 
  group_by(wealth_dec02) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  View()



#######################################################
# what is the SPAN of income decile inhabited by each person (use View to see numbers) - Figure 5.4

# emf(file = here("HILDA", "charts", "span_income.emf"), width = 5.90551, height = 3.14961)

dat %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(span=max(inc_dec)-min(inc_dec)) %>% 
  ungroup() %>% 
  group_by(span) %>% 
  summarise(group_total = sum(long_wt) ) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  ggplot(aes(x=span, y=perc)) +
  geom_col(fill=dark_blue) +
  scale_y_continuous(name="Per cent", limits = c(0, 20), expand = c(0.0001, 0.0001)) +
  scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm") )

# dev.off()


# for presentation on dark background

# emf(file = here("HILDA", "charts", "span_income_presentation.emf"), width = 5.90551, height = 3.14961)

dat %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(span=max(inc_dec)-min(inc_dec)) %>% 
  ungroup() %>% 
  group_by(span) %>% 
  summarise(group_total = sum(long_wt) ) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  ggplot(aes(x=span, y=perc)) +
  geom_col(fill=blue) +
  scale_y_continuous(name="Per cent", limits = c(0, 20), expand = c(0.0001, 0.0001)) +
  scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white))

# dev.off()





# what is the SPAN - WEALTH - histogram - (use View to see numbers) - Figure 5.6

# emf(file = here("HILDA", "charts", "span_wealth_presentation.emf"), width = 5.90551, height = 3.14961)

dat %>% 
  select(xwaveid, long_wt, year, wealth_dec) %>% 
  filter(year %in% wealth_years) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(span=max(wealth_dec)-min(wealth_dec)) %>% 
  ungroup() %>% 
  group_by(span) %>% 
  summarise(group_total = sum(long_wt) ) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  ggplot(aes(x=span, y=perc)) +
  geom_col(fill=blue) +
  scale_y_continuous(name="Per cent", limits = c(0, 30), expand = c(0.0001, 0.0001)) +
  scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) +
  # geom_hline(yintercept = 0, colour = grey) +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white) )


# dev.off()


# same chart - for presentation with dark background
# what is the SPAN - WEALTH - histogram - (use View to see numbers) - Figure 5.6 

# emf(file = here("HILDA", "charts", "span_wealth.emf"), width = 5.90551, height = 3.14961)

dat %>% 
  select(xwaveid, long_wt, year, wealth_dec) %>% 
  filter(year %in% wealth_years) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(span=max(wealth_dec)-min(wealth_dec)) %>% 
  ungroup() %>% 
  group_by(span) %>% 
  summarise(group_total = sum(long_wt) ) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  ggplot(aes(x=span, y=perc)) +
  geom_col(fill=dark_blue) +
  scale_y_continuous(name="Per cent", limits = c(0, 30), expand = c(0.0001, 0.0001)) +
  scale_x_continuous(name="Difference between top and bottom deciles", breaks=seq(0,9), expand = c(0.01, 0.01) ) +
  # geom_hline(yintercept = 0, colour = grey) +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm") )


# dev.off()



# span of income but only looking at wealth years, for comparison with wealth (in text, chap 5)
# "40 per cent had a difference of three or more deciles between the top and bottom wealth deciles they spent time in, 
# while 60 per cent of people had a difference of three or more deciles between the top and bottom income deciles they spent time in"

dat %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  filter(year %in% wealth_years) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(span=max(inc_dec)-min(inc_dec)) %>% 
  ungroup() %>% 
  group_by(span) %>% 
  summarise(group_total = sum(long_wt) ) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  arrange(desc(span)) %>% 
  mutate(culm_perc = cumsum(perc)) %>%  View()




# how many people stay in the same decile, by starting income decile (ONLY WEALTH YEARS) (for in text comparison)
# "People were also twice as likely to remain in the same wealth decile over this period than they were to remain in the same income decile"  

# income deciles

dat %>% 
  arrange(xwaveid, year) %>% 
  filter(year %in% wealth_years) %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(n_inc_dec = n_distinct(inc_dec),
            inc_dec01 = first(inc_dec)) %>% 
  ungroup() %>% 
  group_by(inc_dec01, n_inc_dec) %>%
  summarise(group_total = sum(long_wt)) %>% 
  ungroup() %>% 
  group_by(inc_dec01) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  ungroup() %>% 
  arrange(n_inc_dec) %>% 
  mutate(culm_perc = cumsum(perc)) %>% 
  View()


# how many people stay in the same decile, by starting wealth deciles

dat %>% 
  filter(year %in% wealth_years) %>% 
  arrange(xwaveid, year) %>% 
  select(xwaveid, long_wt, year, wealth_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(n_wealth_dec = n_distinct(wealth_dec),
            wealth_dec02 = first(wealth_dec)) %>% 
  ungroup() %>% 
  group_by(wealth_dec02, n_wealth_dec) %>%
  summarise(group_total = sum(long_wt)) %>% 
  ungroup() %>% 
  group_by(wealth_dec02) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  ungroup() %>% 
  arrange(n_wealth_dec) %>% 
  mutate(culm_perc = cumsum(perc)) %>% 
  View()


# income deciles - all years - how many stuck in the bottom two deciles - 2.77% (exec summary, in text) 

bottom_2_inc <- dat %>% 
  arrange(xwaveid, year) %>% 
  select(xwaveid, long_wt, year, inc_dec) %>% 
  group_by(xwaveid, long_wt) %>% 
  summarise(n_inc_dec = n_distinct(inc_dec),
            max_dec = max(inc_dec)  ) %>% 
  ungroup() %>% 
  filter(n_inc_dec <= 2) %>% 
  filter(max_dec <= 2) %>%
  summarise(group_total = sum(long_wt)) 

tot_pop <- dat %>%
  filter(year==2016) %>% 
  summarise(sum(long_wt))

bottom_2_inc/tot_pop




##################################################
# MOBILITY - RETIREMENT AND COUPLING/DECOUPLING (Table 5.3)


########################################
# Retirement - association with decile changes (Table 5.3)


# change factor to integer
dat <- mutate(dat, just_retired = as.integer(lertr) - 11) 
# 0 = No, not just retired
# 1 = yes, just retired


# comparing those that retired in the last year, and those that didn't (over 18) - total amounts filling - numbers (percents) - responding person - big or small shift

dat %>% 
  select(xwaveid, year, long_wt_rp, age, inc_dec, just_retired, rp) %>% 
  arrange(xwaveid, year) %>% 
  mutate(inc_shift = lead(inc_dec, 2) - lag(inc_dec, 1)) %>%   # numbers determine how many periods to lead or lag - retirement may effect income in this period and one period into the future (as income recorded for previous financial year)
  filter(year %in% seq(2002, 2014) ) %>%   # - need to adjust year to take account of lag/lead (so don't grab inc_dec from different people)
  filter(just_retired == 1 | just_retired == 0) %>% # select on those that answer either yes or no to 'just retired'
  filter(age>=18) %>% 
  filter(rp==1) %>% 
  mutate(shift_size = case_when(
    inc_shift >= 4 ~ "big up",
    inc_shift < 4 & inc_shift > 0 ~ "small up",
    inc_shift == 0 ~ "no shift",
    inc_shift <= -4 ~ "big down",
    inc_shift > -4 & inc_shift < 0 ~ "small down",
    TRUE ~ "wrong") ) %>% 
  group_by(shift_size, just_retired) %>% 
  summarise(group_total = sum(long_wt_rp)) %>% 
  ungroup() %>% 
  group_by(shift_size) %>% 
  mutate(perc = group_total/sum(group_total)) %>% View()




###########################################
# Coupling/Decoupling - association with decile changes (Table 5.3 and preceding para)

# mrcurr variable seems to be the best quality variable if i go to the effort of calculating my own (it is derived from other answers, and includes married and defacto for all years)
# https://www.online.fbe.unimelb.edu.au/HILDAodd/KWCrossWaveCategoryDetails.aspx?varnt=mrcurr


dat <- dat %>%
  mutate(mar_status = as.integer(mrcurr) - 10) 
# small number of refused and don't know
# 1 = legally married
# 2 = de facto
# 3 = separated
# 4 = divorced
# 5 = widowed
# 6 = never married and not defacto

# change in marriage status with previous year
dat <- dat %>%
  arrange(xwaveid, year) %>% 
  mutate(mar_change = case_when(
    mar_status>=3 & lag(mar_status)<=2 ~ "decoupled", # de-coupled in past year
    mar_status<=2 & lag(mar_status)>=3 ~ "coupled",
    TRUE ~ "no change") )


# calculate change in mar status between wealth years
dat_wealth_shift <- dat %>%
  arrange(xwaveid, year) %>% 
  filter(year %in% wealth_years) %>% 
  mutate(mar_change_wealth = case_when(
    mar_status>=3 & lag(mar_status)<=2 ~ "decoupled", # de-coupled in past year
    mar_status<=2 & lag(mar_status)>=3 ~ "coupled",
    TRUE ~ "no change") )


#  percentage of people experiencing given INCOME shift who coupled/decouples/no change (Table 5.3)
dat %>%
  filter(rp==1) %>% 
  select(xwaveid, year, inc_dec, long_wt_rp, mar_change) %>%
  arrange(xwaveid, year) %>%
  mutate(inc_shift = lead(inc_dec, 2) - lag(inc_dec, 1)) %>%   # numbers determine how many periods to lead or lag - retirement may effect income in this period and one period into the future
  filter(year %in% seq(2002, 2014) ) %>%   # - need to adjust year to take account of lag/lead (so don't grab inc_dec from different people)
  mutate(shift_size = case_when(
    inc_shift >= 4 ~ "big up",
    inc_shift < 4 & inc_shift > 0 ~ "small up",
    inc_shift == 0 ~ "no shift",
    inc_shift <= -4 ~ "big down",
    inc_shift > -4 & inc_shift < 0 ~ "small down",
    TRUE ~ "wrong") ) %>% 
  group_by(shift_size, mar_change) %>% 
  summarise(group_total = sum(long_wt_rp)) %>% 
  ungroup() %>% 
  group_by(shift_size) %>% 
  mutate(perc = group_total/sum(group_total)) %>% View()


# percentage of people experiencing given WEALTH shift who coupled/decouples/no change (Table 5.3)

dat_wealth_shift %>% 
  filter(rp==1) %>% 
  select(xwaveid, year, age, wealth_dec, long_wt_rp, mar_change_wealth) %>% 
  arrange(xwaveid, year) %>%
  filter(year %in% wealth_years ) %>% 
  mutate(wealth_shift = wealth_dec - lag(wealth_dec)) %>%   # numbers determine how many periods to lead or lag - retirement may effect wealthome in this period and one period into the future
  filter(age>=18, year!=2002) %>% 
  mutate(shift_size = case_when(
    wealth_shift >= 4 ~ "big up",
    wealth_shift < 4 & wealth_shift > 0 ~ "small up",
    wealth_shift == 0 ~ "no shift",
    wealth_shift <= -4 ~ "big down",
    wealth_shift > -4 & wealth_shift < 0 ~ "small down",
    TRUE ~ "wrong") ) %>% 
  group_by(shift_size, mar_change_wealth) %>% 
  summarise(group_total = sum(long_wt_rp)) %>% 
  ungroup() %>% 
  group_by(shift_size) %>% 
  mutate(perc = group_total/sum(group_total)) %>% View()





##################################################################################
# GINIS - average of 1 year ginis, and multiple year ginis. - 'How do we compare internationally in life course mobility?' section


# eq_disp_inc average GINI across 16 years.- 0.302
dat %>%
  select(year, long_wt, eq_disp_incp) %>% # only positive values
  group_by(year) %>% 
  summarise(gini = gini(eq_disp_incp, weights = long_wt)) %>%  
  ungroup() %>% 
  summarise(avg = mean(gini))

# GINI for income summed across 16 years - 0.249
dat %>%
  select(xwaveid, long_wt, eq_disp_incp) %>% # only positive values
  group_by(xwaveid) %>% 
  summarise(tot_inc = sum(eq_disp_incp),
            avg_long_wt = mean(long_wt)) %>% 
  ungroup() %>% 
  summarise(gini = gini(tot_inc, weights = avg_long_wt))


# CONSUMPTION average GINI across 10 years - 0.251
dat %>%
  select(year, long_wt, eq_cons) %>% 
  filter(year >= 2006, !is.na(eq_cons)) %>% 
  group_by(year) %>% 
  summarise(gini = gini(eq_cons, weights = long_wt)) %>%  
  ungroup() %>% 
  summarise(avg = mean(gini))


# GINI for consumption summed across 16 years - 0.221
dat %>%
  filter(year >= 2006) %>% 
  select(xwaveid, long_wt, eq_cons) %>% # only positive values
  group_by(xwaveid) %>% 
  summarise(tot_cons = sum(eq_cons, na.rm=FALSE), # discard people with na for consumption in any year
            avg_long_wt = mean(long_wt)) %>%
  ungroup() %>% 
  filter(!is.na(tot_cons)) %>% 
  summarise(gini = gini(tot_cons, weights = avg_long_wt))



################## Absolute v relative mobility section
# how many people are earning more in 2016 than in 2001? - 69%

dat %>% 
  select(xwaveid, year, long_wt, eq_disp_inc, age) %>% 
  filter(year %in% c(2001, 2016) ) %>% 
  arrange(xwaveid, year) %>% 
  mutate(inc_increase = ifelse(eq_disp_inc > lag(eq_disp_inc), 1, 0) ) %>% 
  filter(year==2016) %>% 
  # filter(age < 58) %>% 
  summarise(perc_increase = sum(long_wt*inc_increase)/sum(long_wt) )

# consumption - 2006 to 2016 - 59.8%
dat %>% 
  select(xwaveid, year, long_wt, eq_cons, age) %>% 
  filter(year %in% c(2006, 2016) ) %>% 
  arrange(xwaveid, year) %>% 
  mutate(inc_increase = ifelse(eq_cons > lag(eq_cons), 1, 0) ) %>% 
  filter(year==2016) %>% 
  # filter(age < 58) %>% 
  summarise(perc_increase = sum(long_wt*inc_increase)/sum(long_wt) )

# wealth - 2002 to 2014 - 71.1%
dat %>% 
  select(xwaveid, year, long_wt, eq_wealth, age) %>% 
  filter(year %in% c(2002, 2014) ) %>% 
  arrange(xwaveid, year) %>% 
  mutate(inc_increase = ifelse(eq_wealth > lag(eq_wealth), 1, 0) ) %>% 
  filter(year==2014) %>% 
  # filter(age < 58) %>% 
  summarise(perc_increase = sum(long_wt*inc_increase)/sum(long_wt) )



###################################################
# CHAPTER 4
#################################################




##########
# WEALTH ALL AGES by decade (AVERAGE) - Figure 4.13


# emf(file = here("HILDA","charts","mean_wealth_birth_decade.emf"), width = 5.90551, height = 3.14961)

dat %>% 
  group_by(year, birth_decade) %>%
  filter(year %in% wealth_years, !is.na(birth_decade), birth_decade!="1910s") %>%
  summarise(wealth = weighted.mean(eq_wealth, long_wt), avg_age = weighted.mean(age, long_wt)) %>%     # used average of actual age.
  ggplot( aes(x = avg_age, y = wealth, group = birth_decade, col = birth_decade))+ 
  geom_line(size=1.05)+
  geom_point(size=2)+
  scale_y_continuous(name="Dollars", expand=c(0,0), limits = c(0,1000000), labels = comma_format(digits = 0) )+
  scale_x_continuous(expand = c(0.03, 0.03), breaks = c(seq(25,90,5)), limits = c(23,90))+
  scale_color_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.position = "none",
        legend.title.align = 0.5,
        legend.title = element_text()) +
  labs(x = "Average age of birth cohort", y= "Dollars", col = paste0("Decade",'\n', "of birth"))

# dev.off()




################################################################ 
# POVERTY PERSISTANCE - CHAPTER 6
###############################################################


# create poverty dataset and put in wide format
dat_pov <- dat %>% 
  select(xwaveid, year, long_wt, inc_poverty) %>% 
  spread(year, inc_poverty)

# rename years so non-numeric and easier to work with
dat_pov <- rename(dat_pov, y2001='2001', y2002='2002', y2003='2003', y2004='2004',y2005='2005', y2006='2006', y2007='2007', y2008='2008'
                  , y2009='2009', y2010='2010', y2011='2011', y2012='2012', y2013='2013', y2014='2014', y2015='2015', y2016='2016')

# data in long format
dat_pov2 <- dat %>% 
  select(xwaveid, year, long_wt, inc_poverty)


# how many in poverty in each year 
num_pov <- dat_pov2 %>%  
  group_by(year) %>% 
  summarise(average = wtd.mean(x=inc_poverty, weight=long_wt) )

# total population for this dataset
tot_pop <- dat_pov %>% 
  select(long_wt) %>% 
  sum()

# total years spent in poverty out of 16 years
dat_pov <- dat_pov %>% 
  mutate(pov_total = rowSums(select(., y2001:y2016 ) ) )


# average number of years spent in poverty - counting only those with at least one year spent in poverty
dat_pov %>% 
  filter(pov_total>=1) %>% 
  summarise(wtd.mean(pov_total, weight=long_wt))

# perc spending at least 1 year in poverty
one_plus <- dat_pov %>% 
  filter(pov_total>=1) %>% 
  select(long_wt) %>% 
  sum()
# percentage
one_plus/tot_pop

# perc spending at least 5 years in poverty total
five_plus <- dat_pov %>% 
  filter(pov_total>=5) %>% 
  select(long_wt) %>% 
  sum()
# percentage
five_plus/tot_pop

# percent spending 5 consecutive years in poverty

i = 2001
dat_pov <- mutate(dat_pov, run_01 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2002
dat_pov <- mutate(dat_pov, run_02 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2003
dat_pov <- mutate(dat_pov, run_03 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2004
dat_pov <- mutate(dat_pov, run_04 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2005
dat_pov <- mutate(dat_pov, run_05 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2006
dat_pov <- mutate(dat_pov, run_06 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2007
dat_pov <- mutate(dat_pov, run_07 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2008
dat_pov <- mutate(dat_pov, run_08 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2009
dat_pov <- mutate(dat_pov, run_09 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2010
dat_pov <- mutate(dat_pov, run_10 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2011
dat_pov <- mutate(dat_pov, run_11 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )
i = 2012
dat_pov <- mutate(dat_pov, run_12 = rowSums(select(dat_pov, num_range("y", i:(i+4) ) ) ) )


rm(i)

dat_pov <- mutate(dat_pov, pov_run = case_when (
  run_01 ==5 | run_02 ==5 | run_03 ==5 | run_04 ==5 | run_05 ==5 | run_06 ==5 | run_07 ==5 | run_08 ==5 | run_09 ==5 | run_10 ==5 | run_11 ==5 | run_12==5 ~ 1,
  TRUE ~ 0
) )

five_plus_run <- dat_pov %>% 
  filter(pov_run ==1) %>% 
  select(long_wt) %>% 
  sum()

five_plus_run/tot_pop


# percent who have been in poverty for this year and the previous 3 years. last 4 years - 3.57%  (exec summary)

dat_pov <- mutate(dat_pov, last_4_run = ifelse(y2013==1 & y2014==1 & y2015==1 & y2016==1, 1, 0))

last_4_num <- dat_pov %>% 
  select(long_wt, last_4_run) %>% 
  filter(last_4_run ==1) %>% 
  summarise(sum(long_wt))

last_4_num/tot_pop





# probability that a random person spends the first five years in poverty
dat_pov %>%
  summarise(perc = ( sum(if_else(run_01==5, 1, 0)*long_wt)) /sum(long_wt) )



# proportion in poverty in 2001 who are ALSO (not still) in poverty in 2016
pov01 <- dat_pov %>% 
  filter(y2001==1) %>% 
  select(long_wt) %>% 
  sum()

pov0116 <- dat_pov %>%
  filter(y2001==1 & y2016==1) %>% 
  select(long_wt) %>% 
  sum()

pov0116/pov01

rm(pov01, pov0116)

# proportion in poverty in 2001 who are ALSO (not still) in poverty in 2006
pov01 <- dat_pov %>% 
  filter(y2001==1) %>% 
  select(long_wt) %>% 
  sum()

pov0106 <- dat_pov %>%
  filter(y2001==1 & y2006==1) %>% 
  select(long_wt) %>% 
  sum()

pov0106/pov01

rm(pov01, pov0106)

# proportion in poverty in 2006 who are ALSO (not still) in poverty in 2011
pov06 <- dat_pov %>% 
  filter(y2006==1) %>% 
  select(long_wt) %>% 
  sum()

pov0611 <- dat_pov %>%
  filter(y2006==1 & y2011==1) %>% 
  select(long_wt) %>% 
  sum()

pov0611/pov06

rm(pov06, pov0611)

# proportion in poverty in 2011 who are ALSO (not still) in poverty in 2016
pov11 <- dat_pov %>% 
  filter(y2011==1) %>% 
  select(long_wt) %>% 
  sum()

pov1116 <- dat_pov %>%
  filter(y2011==1 & y2016==1) %>% 
  select(long_wt) %>% 
  sum()

pov1116/pov11

rm(pov11, pov1116)


# What proportion of OF THOSE WHO did not start out HILDA in poverty, moved into it after 2001?

no_pov01 <- dat_pov %>%
  filter(y2001==0) %>% 
  select(long_wt) %>% 
  sum()

# total years spent in poverty out of 15 (not 2001)
dat_pov <- dat_pov %>% 
  mutate(pov_total_excl01 = rowSums(select(., y2002:y2016 ) ) )

# perc spending at least 1 year in poverty
one_plus_excl01 <- dat_pov %>% 
  filter(pov_total_excl01 >= 1) %>% 
  select(long_wt) %>% 
  sum()
# percentage
one_plus_excl01/no_pov01


##################
# How many people move out of poverty, but then move back in? - looking for a 1,0,1 pattern in income poverty, which equate to a -1,1 pattern in pov_change

dat_pov <- dat %>% 
  select(xwaveid, year, long_wt, inc_poverty) %>% 
  spread(year, inc_poverty)

dat_pov2 <- dat %>% 
  select(xwaveid, long_wt, year, inc_poverty)

dat_pov2 <- dat_pov2 %>% 
  mutate(pov_change = case_when (
    year==2001 ~ as.numeric(0),
    TRUE ~ inc_poverty - lag(inc_poverty)  # 1 signifies a move into poverty, -1 signifies a move out
  ) )

dat_change <- dat_pov2 %>% 
  select(-inc_poverty) %>% 
  filter(!year==2001) %>%  # exclude 2001 because not possible to record change in poverty status
  spread(year, pov_change)

# rename years so non-numeric and easier to work with
dat_change <- rename(dat_change, y2002='2002', y2003='2003', y2004='2004',y2005='2005', y2006='2006', y2007='2007', y2008='2008'
                     , y2009='2009', y2010='2010', y2011='2011', y2012='2012', y2013='2013', y2014='2014', y2015='2015', y2016='2016')

# unite into one string
dat_string <- unite(dat_change, change_string, y2002:y2016, sep="")

#sub N for eNter poverty and X for eXit poverty
dat_string <- mutate(dat_string, stringB = gsub(pattern="-1", replacement="X", dat_string$change_string))
dat_string <- mutate(dat_string, string = gsub(pattern="1", replacement="N", dat_string$stringB))
dat_string <- select(dat_string, xwaveid, long_wt, string)

# match pattern X first then N in string
dat_string <-  mutate(dat_string, out_in = grepl(pattern = "X.*N", x=dat_string$string) )

### what proportion of all people move out of poverty and then back in (population weighted)
summarise(dat_string, wtd.mean(out_in, weight=long_wt))

# match pattern at least one X
dat_string <-  mutate(dat_string, out = grepl(pattern = "X", x=dat_string$string) )
# ccould use str_detect (stringer package) instead - returns TRUE, FALSE, FALSE

# weighted sum of outs and out_ins
total_out <- dat_string %>% 
  filter(out==TRUE) %>% 
  summarise(sum(long_wt))

total_out_in <- dat_string %>% 
  filter(out_in==TRUE) %>% 
  summarise(sum(long_wt))

### What proportion of respondents who moved out of poverty during HILDA later moved back into poverty?
total_out_in/total_out

##################################
# For people who move out of poverty (X) and then back in (N), what is average amount of time spent out of poverty

# only include those people with a match
dat_out_in <- dat_string %>%
  filter(out_in==TRUE)

# count number of matching strings
dat_out_in <- dat_out_in %>% 
  mutate(count = str_count(string=.$string, pattern = "X0*N"))

# extract all strings - including multiple strings from those people that have more than one
dat_out_in <- dat_out_in %>% 
  mutate(match = str_extract_all(string=.$string, pattern = "X0*N"))

# make each string an observation
dat_out_in <- unnest(dat_out_in, match)

# count lengths of extracted strings
# substract one from string length - want number of periods spent out, not length of string (which includes period in which they returned)
dat_out_in <- dat_out_in %>% 
  mutate(time_out = str_length(string=.$match) - 1 ) 

# calcualted weighted mean of string length
wtd.mean(dat_out_in$time_out, weight=dat_out_in$long_wt)

###################
# For people who exit poverty (X) and then re-enter (N), how long are their spells out of poverty
# 'about one half spend two or more years out of poverty' text in chapter 6.

dat_out_in %>% 
  group_by(time_out) %>% 
  summarise(num = sum(long_wt)) %>%
  mutate(perc = num/sum(num)) %>%
  mutate(perc_culm = cumsum(perc)) 



##################################
# For people who move into poverty(N) and then out (X), what is average amount of time spent in poverty

# count number of matches in string
dat_string <- mutate(dat_string, in_out = str_count(string=dat_string$string, pattern="N0*X") )

# create dataframe just of those people with 1 or more match
dat_in_out <- dat_string %>% 
  select(xwaveid, long_wt, string, in_out) %>% 
  filter(in_out>=1)

# extract strings- including multiple strings from those people that have more than one
dat_in_out <- dat_in_out %>% 
  mutate(match = str_extract_all(string=.$string, pattern = "N0*X"))

# make each string an observation
dat_in_out <- unnest(dat_in_out, match)

# count lengths of extracted strings
# substract one from string length - want number of periods spent in, not length of string (which includes period in which they returned)
dat_in_out <- dat_in_out %>% 
  mutate(time_in = str_length(string=.$match) - 1 ) 

# calcualted weighted mean of time in
wtd.mean(dat_in_out$time_in, weight=dat_in_out$long_wt)

##############################################
# SURVIVAL FUNCTIONS

# For people who move into poverty (N) whether or not they move out, how long to they spend in poverty - for SURVIVAL FUNCTION

# remove excess stuff from dataframe
dat_spell <- dat_string %>% 
  select(xwaveid, long_wt, string)

# count number of spells lasting at least x years
dat_spell <- dat_spell %>% 
  mutate("1" = str_count(string=dat_spell$string,  pattern="N") ,
         "2"  = str_count(string=dat_spell$string, pattern="N0"),
         "3"  = str_count(string=dat_spell$string, pattern="N00") ,
         "4"  = str_count(string=dat_spell$string, pattern="N000") ,
         "5"  = str_count(string=dat_spell$string, pattern="N0000") ,
         "6"  = str_count(string=dat_spell$string, pattern="N00000") ,
         "7"  = str_count(string=dat_spell$string, pattern="N000000") ,
         "8"  = str_count(string=dat_spell$string, pattern="N0000000") ,
         "9"  = str_count(string=dat_spell$string, pattern="N00000000") ,
         "10" = str_count(string=dat_spell$string, pattern="N000000000") ,
         "11" = str_count(string=dat_spell$string, pattern="N0000000000") ,
         "12" = str_count(string=dat_spell$string, pattern="N00000000000") ,
         "13" = str_count(string=dat_spell$string, pattern="N000000000000") ,
         "14" = str_count(string=dat_spell$string, pattern="N0000000000000") ,
         "15" = str_count(string=dat_spell$string, pattern="N00000000000000") ,
         "16" = str_count(string=dat_spell$string, pattern="N000000000000000") )

# generate survival percentage - "% of poverty spells lasting at least x years"
survival <-  dat_spell %>% 
  select(-xwaveid, -string) %>% 
  gather(length, count, -long_wt) %>% 
  group_by(length) %>%
  summarise(pop = sum(count*long_wt)) %>% 
  mutate(length=as.numeric(length)) %>% 
  arrange(length) %>% 
  mutate(perc = 100*(pop/pop[1]) ) %>% 
  filter(length !=1)


# plot survival function
survival %>% 
  ggplot(aes(x=length, y=perc)) +
  geom_point(col=blue) +
  geom_line(col=blue) +
  scale_x_continuous(breaks = seq(2,16,1), name="Lasting at least this many years") +
  ylab("Percentage of poverty spells")




##############################################################################################
# VOLATILITY OF INCOME and CONSUMPTION - CHAPTER 6
#####################################################################################

# Notes from Hardy 2016 - Sample is restricted to household heads between ages 25 and 60 that have non-negative pre-tax & pre-transfer income, 
# without imputed income, and excluding the top 5% of income over each annual sample to exclude topcodes.
# percentage change formula follows Hardy 2016 and Dynan, Elmendorf and Sichel 2012

# should be no top coding for income. for consumption?

#The top-coding thresholds are adjusted over time to overcome the tendency of income and wealth measures to inflate. 
#If you need to know the threshold values that have been used at a particular release, see “HILDA-thresholds-by-wave.csv” in the release documentation.


# INCOME
# positive values for eq_disp_inc only (p suffix)
dat <- dat %>%
  arrange(xwaveid, year) %>% 
  mutate(perc_change_incp = 100*( (eq_disp_incp - lag(eq_disp_incp) ) / ((abs(eq_disp_incp) + abs(lag(eq_disp_incp)))/2) ) )

# CONSUMPTION
dat <- dat %>%
  arrange(xwaveid, year) %>% 
  mutate(perc_change_cons = 100*( (eq_cons - lag(eq_cons) ) / ((abs(eq_cons) + abs(lag(eq_cons)))/2) ) )

#############
# VOLATILITY OF INCOME AND CONSUMPTION - POVERTY V NO POVERTY

# group by whether or not in income poverty

# consumption and income variability - averaged over 2006-2016
dat %>% 
  filter(year>2006, age>=25, age<60) %>%  #  can't calculate income change for 2006.
  group_by(year, inc_poverty) %>% 
  summarise(cons_sd = sqrt(wtd.var(x=perc_change_cons, weights = long_wt) ),
            inc_sd = sqrt(wtd.var(x=perc_change_incp, weights = long_wt)  ) ) %>% 
  gather(key = "measure", value="value", -year, -inc_poverty) %>% 
  ungroup() %>% 
  group_by(inc_poverty, measure) %>% 
  summarise(average = mean(value)) %>% View()





######################################################################
# POVERTY (SAMPLE 2013 to 2016)
######################################################################


# use long_13thru16 sample for this


# create poverty dataset and put in wide format
dat_pov13 <- dat %>% 
  select(xwaveid, year, wlem_p, inc_poverty) %>% 
  rename(long_wt_last4 = wlem_p) %>%   # weights for those enumerated in the last 4 years
  spread(year, inc_poverty)

# rename years so non-numeric and easier to work with
dat_pov13 <- rename(dat_pov13, y2013='2013', y2014='2014', y2015='2015', y2016='2016')


# total population for this dataset
tot_pop13 <- dat_pov13 %>% 
  summarise(sum(long_wt_last4)) 


# percent who have been in poverty for this year and the previous 3 years. last 4 years - 3.84% (compared to 3.57% using full sample)  (exec summary)

dat_pov13 <- mutate(dat_pov13, last_4_run = ifelse(y2013==1 & y2014==1 & y2015==1 & y2016==1, 1, 0))

last_4_num13 <- dat_pov13 %>% 
  select(long_wt_last4, last_4_run) %>%  
  filter(last_4_run ==1) %>% 
  summarise(sum(long_wt_last4))

# percent of population
last_4_num13/tot_pop13

# percent of population as at June 2016 ((http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/3101.0Dec%202017?OpenDocument)
24190907*(last_4_num13/tot_pop13)