# Productivity Commission 2018 analysis for the Rising Inequality? Commission Research Paper

# Cross-sectional HILDA analysis

# Need to run the top part of code, to get income, wealth, consumption deciles, etc. (up to line ~160) then can run bit by bit.


#############################################################
# INCOME DECILES
#############################################################

# weighted cut points       
cuts_wtd <- dat %>%
  group_by(year)  %>%
  do(data.frame(t(wtd.quantile(.$eq_disp_inc, weights=.$hh_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>%    # the t is a transpose
  ungroup() %>% 
  rename(cut0=X..0., cut10=X.10., cut20=X.20., cut30=X.30., cut40=X.40., cut50=X.50., cut60=X.60., cut70=X.70., cut80=X.80., cut90=X.90., cut100=X100.)

# add in 50% of income variable
cuts_wtd <- mutate(cuts_wtd, inc_poverty_line = 0.5*cut50)

# join weighted cut points to the dataframe
dat <- left_join(dat, cuts_wtd, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat <-  dat %>%
  mutate(inc_dec = case_when(
    eq_disp_inc <= cut10                       ~ 1,
    eq_disp_inc > cut10 & eq_disp_inc <= cut20 ~ 2,
    eq_disp_inc > cut20 & eq_disp_inc <= cut30 ~ 3,
    eq_disp_inc > cut30 & eq_disp_inc <= cut40 ~ 4,
    eq_disp_inc > cut40 & eq_disp_inc <= cut50 ~ 5,
    eq_disp_inc > cut50 & eq_disp_inc <= cut60 ~ 6,
    eq_disp_inc > cut60 & eq_disp_inc <= cut70 ~ 7,
    eq_disp_inc > cut70 & eq_disp_inc <= cut80 ~ 8,
    eq_disp_inc > cut80 & eq_disp_inc <= cut90 ~ 9,
    eq_disp_inc > cut90                        ~ 10,
    TRUE ~ -99
  )
  )

# write to csv for use externally
# HILDA_cuts_wtd <- cuts_wtd %>%
#   select(-inc_poverty_line)
# write.csv(x=HILDA_cuts_wtd, file= here("shared_dataframes", "HILDA_cuts_wtd.csv"), row.names=FALSE)
# rm(HILDA_cuts_wtd)


# assign each observation a 1 if equivalised household disposable income less than half of median, 0 otherwise
dat <- dat %>%
  mutate(inc_poverty = case_when(
    eq_disp_inc < inc_poverty_line ~ 1,
    eq_disp_inc >= inc_poverty_line ~ 0,
    TRUE ~ -99)
  )

# Income ratios
cuts_wtd <-  mutate(cuts_wtd, ninety10 = cut90/cut10)
cuts_wtd <-  mutate(cuts_wtd, ninety50 = cut90/cut50)
cuts_wtd <-  mutate(cuts_wtd, fifty10 = cut50/cut10)


#################################################################
# WEALTH DECILES
##############################################################

# weighted cut points       
cuts_wealth <- dat %>%
  group_by(year)  %>%
  filter(year %in% wealth_years) %>%
  do(data.frame(t(wtd.quantile(.$eq_wealth, weights=.$hh_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>% 
  ungroup() %>% 
  rename(wealth_cut0=X..0., wealth_cut10=X.10., wealth_cut20=X.20., wealth_cut30=X.30., wealth_cut40=X.40., wealth_cut50=X.50., 
         wealth_cut60=X.60., wealth_cut70=X.70., wealth_cut80=X.80., wealth_cut90=X.90., wealth_cut100=X100.)
                               

# join weighted cut points to the dataframe
dat <- left_join(dat, cuts_wealth, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat <-  dat %>%
  mutate(wealth_dec = case_when(
    eq_wealth <= wealth_cut10                       ~ 1,
    eq_wealth > wealth_cut10 & eq_wealth <= wealth_cut20 ~ 2,
    eq_wealth > wealth_cut20 & eq_wealth <= wealth_cut30 ~ 3,
    eq_wealth > wealth_cut30 & eq_wealth <= wealth_cut40 ~ 4,
    eq_wealth > wealth_cut40 & eq_wealth <= wealth_cut50 ~ 5,
    eq_wealth > wealth_cut50 & eq_wealth <= wealth_cut60 ~ 6,
    eq_wealth > wealth_cut60 & eq_wealth <= wealth_cut70 ~ 7,
    eq_wealth > wealth_cut70 & eq_wealth <= wealth_cut80 ~ 8,
    eq_wealth > wealth_cut80 & eq_wealth <= wealth_cut90 ~ 9,
    eq_wealth > wealth_cut90                        ~ 10,
    TRUE ~ -99
  )
  )

# Wealth ratios
cuts_wealth <-  mutate(cuts_wealth, ninety10 = wealth_cut90/wealth_cut10)
cuts_wealth <-  mutate(cuts_wealth, ninety50 = wealth_cut90/wealth_cut50)
cuts_wealth <-  mutate(cuts_wealth, fifty10 = wealth_cut50/wealth_cut10)


#############################################################################
# CONSUMPTION DECILES
###########################################################################

# CREATE CONSUMPTION DATAFRAME
# drop those not appropriate for consumption - generate new consumption data frame (observations before 2006 and those with negative (missing) rent)
dat_cons <- dat %>% 
  filter(year >= 2006) %>% 
  subset(eq_rent >= 0) 

##
# CONSUMPTION DECILES

# weighted cut points       
cuts_cons <- dat_cons %>%
  group_by(year)  %>%
  do(data.frame(t(wtd.quantile(.$eq_cons, weights=.$hh_wt, probs=seq(from = 0, to = 1, by = 0.1) ) ) ) ) %>% 
  ungroup() %>% 
  rename(cons_cut0=X..0., cons_cut10=X.10., cons_cut20=X.20., cons_cut30=X.30., cons_cut40=X.40., cons_cut50=X.50., 
         cons_cut60=X.60., cons_cut70=X.70., cons_cut80=X.80., cons_cut90=X.90., cons_cut100=X100.)


# add in 50% of income variable
cuts_cons <- mutate(cuts_cons, cons_poverty_line = 0.5*cons_cut50)

# join weighted cut points to the dataframe
dat_cons <- left_join(dat_cons, cuts_cons, by="year")

# assign each observation to a decile basked depending on relevant weighted cut points
dat_cons <-  dat_cons %>%
  mutate(cons_dec = case_when(
    eq_cons <= cons_cut10                       ~ 1,
    eq_cons > cons_cut10 & eq_cons <= cons_cut20 ~ 2,
    eq_cons > cons_cut20 & eq_cons <= cons_cut30 ~ 3,
    eq_cons > cons_cut30 & eq_cons <= cons_cut40 ~ 4,
    eq_cons > cons_cut40 & eq_cons <= cons_cut50 ~ 5,
    eq_cons > cons_cut50 & eq_cons <= cons_cut60 ~ 6,
    eq_cons > cons_cut60 & eq_cons <= cons_cut70 ~ 7,
    eq_cons > cons_cut70 & eq_cons <= cons_cut80 ~ 8,
    eq_cons > cons_cut80 & eq_cons <= cons_cut90 ~ 9,
    eq_cons > cons_cut90                        ~ 10,
    TRUE ~ -99
  )
  )


# join consumption deciles to main frame
cons_join <- dat_cons %>% 
  select(xwaveid, year, cons_dec)

dat <- left_join(dat, cons_join, by=c("xwaveid","year"))

rm(cons_join)

############################















########################################################################################################
# BEGINNING OF OPTIONAL ANALYSIS - CAN BE DONE BIT BY BIT
######################################################################################################

























#######################################################################
# CHAPTER 3 - INCOME
#######################################################################

#################
# figure 3.1- median and mean income for combined chart - HILDA combine with HES            (figure 3.1)

HILDA_income <- dat %>%
  select(year, eq_disp_inc, hh_wt) %>% 
  group_by(year) %>% 
  summarise(mean = wtd.mean(eq_disp_inc, weights=hh_wt),
            median = wtd.quantile(eq_disp_inc, weights=hh_wt, probs=0.5) ) %>% 
  mutate(source="HILDA") 

# saveRDS(HILDA_income, file= here("shared_dataframes", "HILDA_income.rds") )


################
# figure 3.2 - GINI COEFFICIENTS (to feed into HES/SIH code)                                               (figure 3.2)

ginis <- dat %>%
  select(year, hh_wt, eq_disp_inc, eq_gross_inc, eq_priv_inc, eq_lab_inc, eq_wealth, eq_cons) %>%
  gather(key = "variable", value = "value", c(-year, -hh_wt)) %>%
  mutate(value = ifelse(value >= 0, value, 0)) %>%                         # treat negative numbers as zeroes
  group_by(year, variable) %>%
  summarise(gini = gini(value, weights = hh_wt))

# saveRDS(ginis, file= here("shared_dataframes", "HILDA_ginis.rds") )


#############
# INCOME quantile RATIOS - for HES check                                           (figure 3.3)
cuts_wtd %>% 
  select(year, ninety10, ninety50, fifty10) %>%
  gather(ratio, value, -year) %>% 
  ggplot(aes(x=year, y=value, col=ratio) ) +
  geom_line(size=2)



  
########
# disposable income ginis using two equivalisation methods (OECD_mod and square root) - for HES check    (feeds into figure 3.4)
ginis <- dat %>%
  select(year, hh_wt, eq_disp_inc, sqrt_eq_disp_inc) %>% 
  gather(key = "variable", value = "value", c(-year, -hh_wt)) %>%
  mutate(value = ifelse(value >= 0, value, 0)) %>%    # treat all negative values as 0
  group_by(year, variable) %>%
  summarise(gini = gini(value, weights = hh_wt)) %>% 
  spread(key=variable, value=gini, drop=FALSE)




# figure 3.5 - HILDA not available for time period


##############
# figure 3.6 - Income growth rates vary by period - for HES check     (figure 3.6)

dat %>%
  select(year,hh_wt, inc_dec, eq_disp_inc) %>%
  group_by(year, inc_dec) %>%
  summarise(mean_value = (wtd.mean(eq_disp_inc, weight = hh_wt))) %>%
  ungroup() %>%
  arrange(inc_dec,year) %>%
  filter(year %in% c(2004, 2010, 2016)) %>% 
  mutate(percent=(((mean_value/lag(mean_value))^(1/(year-lag(year)))) - 1)*100)  %>%
  filter(year!=2004) %>% 
  ggplot(aes(x = factor(year), y = percent, fill = inc_dec, group = inc_dec))+
  geom_col(position="dodge")+
  scale_y_continuous(labels = comma, expand=c(0.01,0.01))+
  scale_x_discrete(expand = c(0.01, 0.01))+
  labs(x = "HES period", y = "Per cent", fill = "Income decile:")+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text(),
        axis.line.x = element_blank())+
  geom_hline(yintercept = 0, colour = grey)


# figure 3.7 - HILDA not available for time period


#################
# TOP INCOME CHARTS                                       (figure 3.8)

#Top 1 per cent column
dat <- dat %>% 
  group_by(year) %>%
  mutate(top_one_per_cent = ifelse(eq_disp_inc > wtd.quantile(eq_disp_inc, weights=hh_wt, probs=0.99), 1, 0)) %>%
  mutate(ninety_nine_per_cent = abs(top_one_per_cent-1))

#Top 1 per cent mean income growth - 2.3% p.a. 2001-2016
dat %>% 
  group_by(year) %>%
  summarise(mean_value = wtd.mean(eq_disp_inc, weights=hh_wt*top_one_per_cent)) %>% 
  mutate(one_per_cent_growth = (((mean_value/lag(mean_value,15))^(1/(year-lag(year,15)))) - 1)*100)

# income growth for whole population - 1.9%
dat %>% 
  group_by(year) %>%
  summarise(mean_value = wtd.mean(eq_disp_inc, weights=hh_wt)) %>% 
  mutate(one_per_cent_growth = (((mean_value/lag(mean_value,15))^(1/(year-lag(year,15)))) - 1)*100)

#Top 1 per cent income share - 4.8% in 2016
dat %>% 
  select(year, hh_wt, eq_disp_inc, top_one_per_cent) %>%
  group_by(year) %>%
  summarise(share = sum(eq_disp_inc*hh_wt*top_one_per_cent)/sum(eq_disp_inc*hh_wt)*100)


#Gini comparison with and without top 1 per cent - no negative values
dat %>%
  select(year, hh_wt, eq_disp_incp) %>%
  group_by(year) %>%
  mutate(ninety_nine_per_cent_indicator = ifelse(eq_disp_incp < wtd.quantile(eq_disp_incp, weights=hh_wt, probs=0.99), 1, 0)) %>%
  summarise(total_gini = gini(eq_disp_incp, weights = hh_wt),
            ninety_nine_per_cent_gini = gini(eq_disp_incp, weights = hh_wt*ninety_nine_per_cent_indicator),
            per_cent_difference = (total_gini - ninety_nine_per_cent_gini)/total_gini*100)



###########
# Figure 3.9 - Income mostly comes from labour for most households              (Figure 3.9)

dat %>% 
  select(year,hh_wt, inc_dec, eq_lab_inc, eq_cap_inc, eq_trans_inc, eq_tax, eq_gross_inc) %>%
  filter(year==2016) %>%
  gather(inc_type,value,c(-year,-inc_dec, -hh_wt, -eq_gross_inc)) %>%
  group_by(year, inc_type,inc_dec) %>% 
  summarise(percent = 100*sum(value*hh_wt)/sum(eq_gross_inc*hh_wt)) %>% 
  ggplot(aes(x = factor(inc_type), y = percent, fill = inc_dec, group = inc_dec))+
  geom_col(position = "dodge")+
  scale_x_discrete( expand = c(0.01, 0.01))+
  scale_y_continuous(expand=c(0.0,0.0), limits = c(-40,120))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text(),
        axis.line.x = element_blank())+
  labs(x="Income type", y = "Per cent", fill="Income decile:")+
  geom_hline(yintercept = 0, colour = grey)+ #Add this to get a horizontal axis line at zero
  guides(fill=guide_legend(nrow=1, byrow=TRUE))



# Figure 3.10 - HILDA doesn't cover this span of time



###################
# Figure 3.11 - Taxes and transfers consistently reduce inequality

dat %>%
  select(year, hh_wt, eq_disp_inc, eq_gross_inc, eq_priv_inc) %>%
  gather(key = "variable", value = "value", c(-year, -hh_wt)) %>%
  mutate(value = ifelse(value >= 0, value, 0)) %>%                         # treat negative numbers as zeroes
  group_by(year, variable) %>%
  summarise(gini = gini(value, weights = hh_wt)) %>% 
  ggplot(aes(x=year, y=gini, col= variable, group=variable)) +
  geom_line()


#################
# Figure 3.12 - Changes in labour and transfer income have driven overall trends in inequality. marginal effect on the gini.      (Figure 3.12)


##Marginal effects of income components

#Effect of income tax on disposable income

marginal_disp_ginis <- dat %>%
  select(year, hh_wt, eq_disp_inc, eq_gross_inc) %>%
  mutate(eq_disp_inc_non_zero = ifelse(eq_disp_inc > 0, eq_disp_inc,0)) %>%
  mutate(eq_gross_inc_non_zero = ifelse(eq_gross_inc > 0, eq_gross_inc,0)) %>%
  group_by(year) %>%
  summarise(eq_disp_inc_gini = gini(eq_disp_inc_non_zero, weights = hh_wt),
            eq_gross_inc_gini = gini(eq_gross_inc_non_zero, weights = hh_wt),
            marginal_effect = eq_disp_inc_gini-eq_gross_inc_gini)


#Effect of labour, capital and transfers on gross income

marginal_gross_ginis <- dat %>%
  select(year, hh_wt, eq_gross_inc, eq_lab_inc, eq_cap_inc, eq_trans_inc) %>%
  mutate(gross_total = ifelse(eq_gross_inc > 0, eq_gross_inc,0),
         gross_less_lab = ifelse(eq_gross_inc - eq_lab_inc > 0, eq_gross_inc - eq_lab_inc,0),
         gross_less_cap = ifelse(eq_gross_inc - eq_cap_inc > 0, eq_gross_inc - eq_cap_inc,0),
         gross_less_trans = ifelse(eq_gross_inc - eq_trans_inc > 0, eq_gross_inc - eq_trans_inc,0)) %>%
  gather(key = "variable", value = "value", c(gross_less_lab, gross_less_cap, gross_less_trans)) %>%
  group_by(year, variable) %>%
  summarise(gross_gini = gini(gross_total, weights = hh_wt),
            gross_less_gini = gini(value, weights = hh_wt),
            marginal_effect = gross_gini-gross_less_gini)


#Combining all marginal effects

marginal_disp_ginis <- marginal_disp_ginis %>% 
  mutate(variable = "disp_less_tax") 

marginal_ginis <- bind_rows(marginal_gross_ginis,marginal_disp_ginis)


#figure 3.12a

# emf(file = here("HES","Charts","marginal_gini_effects_tax.emf"), width = 2.95276, height = 2.3622)

marginal_ginis %>%
  filter(variable=="disp_less_tax") %>%
  ggplot(aes(x=year,y=marginal_effect, col = variable)) +
  geom_line(size=1)+
  geom_point(size=2)+
  #  scale_x_continuous(breaks = HES_years, labels = short_fin_year_labels, position = "top")+
  scale_color_manual(values=PCcols)+
  xlab("Year")+
  ylab("Change in Gini coefficient")+
  ylim(-0.1,0)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        strip.background = element_rect(fill = grey),
        strip.text = element_text(face = "bold"),
        panel.border = element_rect(colour = grey, fill = NA),
        legend.position = "none")

# dev.off()

#figure 3.12b

# emf(file = here("HES","Charts","marginal_gini_effects_capital.emf"), width = 2.95276, height = 2.3622)

marginal_ginis %>%
  filter(variable=="gross_less_cap") %>%
  ggplot(aes(x=year,y=marginal_effect, col = variable)) +
  geom_line(size=1)+
  geom_point(size=2)+
  # scale_x_continuous(breaks = HES_years, labels = short_fin_year_labels, position = "top")+
  scale_color_manual(values=PCcols)+
  xlab("Year")+
  ylab("Change in Gini coefficient")+
  ylim(-0.1,0)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        strip.background = element_rect(fill = grey),
        strip.text = element_text(face = "bold"),
        panel.border = element_rect(colour = grey, fill = NA),
        legend.position = "none")

# dev.off()

#figure 3.12c

# emf(file = here("HES","Charts","marginal_gini_effects_labour.emf"), width = 2.95276, height = 2.3622)

marginal_ginis %>%
  filter(variable=="gross_less_lab") %>%
  ggplot(aes(x=year,y=marginal_effect, col = variable)) +
  geom_line(size=1)+
  geom_point(size=2)+
  #  scale_x_continuous(breaks = HES_years, labels = short_fin_year_labels, position = "top")+
  scale_color_manual(values=PCcols)+
  xlab("Year")+
  ylab("Change in Gini coefficient")+
  ylim(-0.35,-0.25)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        strip.background = element_rect(fill = grey),
        strip.text = element_text(face = "bold"),
        panel.border = element_rect(colour = grey, fill = NA),
        legend.position = "none")

# dev.off()

#figure 3.12d

# emf(file = here("HES","Charts","marginal_gini_effects_transfer.emf"), width = 2.95276, height = 2.3622)

marginal_ginis %>%
  filter(variable=="gross_less_trans") %>%
  ggplot(aes(x=year,y=marginal_effect, col = variable)) +
  geom_line(size=1)+
  geom_point(size=2)+
  scale_x_continuous(breaks = HES_years,  position = "top")+
  scale_color_manual(values=PCcols)+
  xlab("Year")+
  ylab("Change in Gini coefficient")+
  ylim(-0.15,-0.05)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        strip.background = element_rect(fill = grey),
        strip.text = element_text(face = "bold"),
        panel.border = element_rect(colour = grey, fill = NA),
        legend.position = "none")

# dev.off()




# figure 3.13 - HILDA doesn't ahve the same welfare payment splits



#########################################################################
# Section 3.3 - the demographics of the income distribution


###########
# Figure 3.14 - Representation in income deciles varies by age - check HES      (Figure 3.14)

# Share of people of a given age in each income decile, 2015-16

dat %>% 
  filter(year==2016) %>% 
  group_by(inc_dec, age_group_string) %>% 
  summarise(pop = sum(hhwt)) %>% 
  ungroup() %>% 
  group_by(age_group_string) %>% 
  mutate(percent = pop/sum(pop)) %>% 
  ggplot(aes(x=factor(age_group_string), y=percent*100, fill=inc_dec, group=inc_dec ) ) +
  # geom_hline(yintercept = 10)+
  geom_col(position = "dodge")+
  scale_x_discrete(expand = c(0.1, 0.1))+
  scale_y_continuous(expand=c(0,0), limits = c(0,25))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text()) +
  labs(x = "Age group", y = "Per cent", fill = "Income decile")+
  geom_hline(yintercept = 10, colour = grey, size = 0.5, linetype = "dashed")+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))



#############
# Figure 3.15 - Young people have had the worst of recent weak growth - check HES       (Figure 3.15)

dat %>%
  select(year, hh_wt, eq_disp_inc, age_group) %>%
  filter(!is.na(age_group)) %>%
  group_by(year, age_group) %>%
  summarise(mean_value = (wtd.mean(eq_disp_inc, weight = hh_wt))) %>%
  ungroup() %>%
  arrange(age_group,year) %>%
  filter(year %in% c(2004,2010,2016)) %>% 
  mutate(percent=(((mean_value/lag(mean_value))^(1/(year-lag(year)))) - 1)*100) %>% 
  filter(year %in% c(2010,2016)) %>% 
  arrange(year) %>% 
  ggplot(aes(x = factor(year), y = percent, fill = age_group, group = age_group))+
  geom_col(position="dodge")+
  scale_y_continuous(labels = comma, expand=c(0.01,0.01))+
  scale_x_discrete(breaks = HES_years, labels = HESP_period_labels, expand = c(0.01, 0.01))+
  labs(x = "HES period", y = "Per cent", fill = "Age group:")+
  scale_fill_manual(values = PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text(),
        axis.line.x = element_blank())+
  geom_hline(yintercept = 0, colour = grey)+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))
  


############
#Mean disposable income by age and birth decade - as per HES    (figure 3.16)

birth_decade_data <- dat %>% group_by(year, birth_decade) %>%
  filter(!is.na(birth_decade)) %>%
  summarise(disp_inc = weighted.mean(eq_disp_inc, hh_wt), age = weighted.mean(age, hh_wt)) %>%   # used average of the ages of cohort by birth decade
  filter(age <= 65)

ggplot(birth_decade_data, aes(x = age, y = disp_inc, group = birth_decade, col = birth_decade))+
  geom_point()+
  geom_line()+
  scale_y_continuous(expand=c(0,0), limits = c(0,70000), labels = comma)+
  scale_x_continuous(expand = c(0.01, 0.01), breaks = c(seq(15,65,5)), limits = c(15,65))+
  scale_color_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        axis.title.y = element_blank(),
        legend.position = "none",
        legend.title.align = 0.5,
        legend.title = element_text()) +
  labs(x = "Age", col = paste0("Decade",'\n', "of birth")) +
  geom_label_repel(aes(label = birth_decade),
                   data = subset(birth_decade_data, year == 2016 | (birth_decade == "1940s" & year == 2010)),
                   nudge_x = 0,
                   nudge_y = 3100,
                   na.rm = TRUE,
                   label.size = NA)


#############
# Figure 3.17 - group by household type and show proportion in each decile - INCOME (income dec)      (Figure 3.17)
dat %>%
  filter(year==2016) %>%  
  group_by(household_type, inc_dec) %>%
  summarise(group_total = sum(hh_wt)) %>%
  ungroup() %>%
  group_by(household_type) %>%
  mutate(percent = group_total/sum(group_total)*100)%>%
  ggplot(aes(x = household_type, y=percent, fill=inc_dec, group=inc_dec)) +
  # geom_hline(yintercept = 10)+
  geom_col(position = "dodge")+
  scale_x_discrete(expand = c(0.1, 0.1), labels = household_type_labels_spaced)+
  scale_y_continuous(expand=c(0,0), breaks = seq(0,60,10))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text()) +
  labs(x = "Household type", y = "Per cent", fill = "Income decile:")+
  geom_hline(yintercept = 10, colour = grey, size = 0.5, linetype = "dashed")+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))
  

#######################################################################
# Section 3.4


# Figure 3.18  - Average final equivalised consumption by income decile - can't do in HILDA as no in kind transfers, also no savings.


#########
# figure 3.19 - income and consumption by income deciles      (figure 3.19) - can't do in HILDA as no in kind transfers, also no savings.


dat %>%
  select(year, hhwt, inc_dec, eq_disp_inc, eq_cons) %>%
  gather(variable, value, -year, -hhwt, -inc_dec) %>% 
#  mutate(variable = factor(variable, levels = consumption_levels)) %>%
  filter(year==2016) %>% 
  group_by(variable, inc_dec) %>%
  summarise(mean_value = (wtd.mean(value, weight = hhwt))) %>% 
  ggplot(aes(x = factor(inc_dec), y = mean_value, fill = variable, group = variable))+
  geom_col(position="dodge")+
  scale_y_continuous(labels = comma, expand=c(0,0), limits = c(0,150000))+
  scale_x_discrete(expand = c(0.01, 0.01), labels = inc_dec_labels)+
  scale_fill_manual(values = PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.position = "none")+
  facet_wrap(~variable, labeller = consumption_labels)+
  theme(legend.position = "none",
        legend.title=element_blank(),
        strip.background = element_rect(fill = grey),
        strip.text = element_text(face = "bold"),
        panel.border = element_rect(colour = grey, fill = NA),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1),
        plot.margin=unit(c(1,1,1.5,1.2),"cm"))


##############
# Figure 3.20 - Gini coefficients for equivalised disposable income, private consumption and final consumption     (figrue 3.20)

# no final consumption available for HILDA

dat %>%
  select(year, hh_wt, eq_disp_inc, eq_cons) %>%
  mutate(eq_cons = ifelse(year<2006, 0.2, eq_cons)) %>%  # fudge so that ggplot works with na's
  gather(key = "variable", value = "value", c(-year, -hh_wt)) %>%
  mutate(value = ifelse(value >= 0, value, 0)) %>%                         # treat negative numbers as zeroes
  group_by(year, variable) %>%
  summarise(gini = gini(value, weights = hh_wt)) %>% 
ggplot(aes(x=year, y=gini, col=variable )) +
  geom_point(size=1) +
  geom_line(size=0.5) +
  scale_x_continuous(labels = wealth_fin_year_labels, breaks=seq(2002, 2015, 2), name="Year" ) +
  scale_y_continuous(labels=comma, name="Gini coefficient", limits = c(0, 0.4)) +
  scale_color_manual(values=PCcols)



# figure 3.21 - no in kind transfers in HILDA so no check on HES


# Figure 3.22 - no in kind transfers in HILDA so can't do final consumption deciles. Disposable income deciles in figure 3.17 above.



#######################################################################
# CHAPTER 4 - WEALTH
#######################################################################



#######
# Figure 4.1 - WEALTH MEANS AND MEDIANS - JOINT CHART WITH SIH -  'average wealth has steadily increased'.       (FIGURE 4.1)

SIH_wealth <-  readRDS(file= here("shared_dataframes", "SIH_wealth_mean_and_median.rds") )
SIH_wealth <- mutate(SIH_wealth, source="SIH",
                     year = year-1)

HILDA_wealth <- dat %>%
  select(year, eq_wealth, hh_wt) %>% 
  filter(year %in% wealth_years) %>% 
  group_by(year) %>% 
  summarise(mean = wtd.mean(eq_wealth, weights=hh_wt),
            median = wtd.quantile(eq_wealth, weights=hh_wt, probs=0.5) ) %>% 
  mutate(source="HILDA")

# emf(file = here("HILDA", "charts", "wealth_means_medians.emf"), width = 5.5, height = 3.14961)

bind_rows(SIH_wealth, HILDA_wealth) %>%
  rename(Average=mean, Median=median) %>%
  gather(key="measure", value="value", -year, -source) %>%
  ggplot(aes(x=year, y=value, col=source, shape=measure )) +
  geom_point(size=2) +
  geom_line(size=1.05) +   
  scale_x_continuous(expand = c(0.03, 0.03), labels = wealth_fin_year_labels, breaks=seq(2002, 2015, 2), name="Year") +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Dollars", limits=c(0,600000)) +
  scale_color_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none" )

# dev.off()


# calculate percentage changes - annualised

SIH_wealth %>%
  arrange(year) %>% 
  mutate(perc_growth_mean = ((mean/lag(mean,5))^(1/(year-lag(year,5))) - 1)*100,
         perc_growth_median = ((median/lag(median,5))^(1/(year-lag(year,5))) - 1)*100) 

HILDA_wealth %>%
  arrange(year) %>% 
  mutate(perc_growth_mean = ((mean/lag(mean,3))^(1/(year-lag(year,3))) - 1)*100,
         perc_growth_median = ((median/lag(median,3))^(1/(year-lag(year,3))) - 1)*100)


###########
# Figure 4.2 - average wealth by wealth deciles  check against HES                                           (figure 4.2)

dat %>% 
  select(year, hh_wt, wealth_dec, eq_wealth) %>% 
  filter(year %in% c(2002,2014)) %>% 
  group_by(year, wealth_dec) %>% 
  summarise(mean_value = wtd.mean(eq_wealth, hh_wt)) %>% 
  ggplot(aes(x = factor(wealth_dec), y = mean_value, fill = factor(year), group = year))+
  geom_col(position="dodge")+
  scale_y_continuous(labels = comma, expand = c(0, 0))+
  scale_x_discrete(expand = c(0.01, 0.01))+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"))+
  scale_fill_manual(values = PCcols, labels = wealth_fin_year_labels) +
  labs(x = "Wealth decile", y = "Dollars", fill = "Year")

######
# explore fall in WEALTH in BOTTOM decile - absolute changes (home and other property on equity basis) - footnote comparing to HES

# explore numbers for bottom decile (not including eq_life_insurance)
dat %>% 
  filter(year==2014,
         wealth_dec==1) %>% 
  select(xwaveid, year, hh_wt, wealth_dec, eq_bank, eq_investments, eq_super, eq_home_equity, eq_other_property_equity, eq_business_equity,  eq_collect, eq_vehicles, 
         eq_other_personal_debt, eq_credit_card, eq_HECS ) %>%
  gather(asset_type, value,-xwaveid, -year, -hh_wt, -wealth_dec) %>% 
  group_by(asset_type, wealth_dec) %>% 
  summarise(group_total = wtd.mean(value, hh_wt) ) %>%
  View()

# changes
dat %>% 
  filter(year %in% wealth_years) %>% 
  filter(wealth_dec==1) %>% 
  select(xwaveid, year, hh_wt, eq_bank, eq_equity_and_cash, eq_trust_funds, eq_super, eq_life_insurance, eq_home_equity, eq_business_equity,
         eq_collect, eq_vehicles, eq_credit_card, eq_HECS, eq_other_personal_debt, eq_other_property_equity, eq_overdue_bills) %>%  # have excluded total wealth
  gather(asset_type, value,-xwaveid, -year, -hh_wt) %>% 
  group_by(asset_type, year) %>% 
  summarise(group_total = wtd.mean(value, hh_wt) ) %>%
  ungroup() %>% 
  arrange(asset_type, year) %>% 
  mutate(abs_change = (group_total - lag(group_total, 3) ) ) %>% 
  filter(year==2014) %>% 
  ggplot(aes(x=asset_type, y=abs_change))+
  geom_col(position="dodge") +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_y_continuous(labels=comma) +
  PCcols_gradient

###########
# Figure 4.3 - percentage growth in wealth by wealth decile - matches broadly with HES results in wealth chapter. - annualised basis     (figure 4.3)
dat %>%
  select(eq_wealth, hh_wt, xwaveid, year, wealth_dec) %>% 
  filter(year %in% c(2002, 2014) ) %>% 
  group_by(year, wealth_dec) %>% 
  summarise(avg_wealth = wtd.mean(eq_wealth, weights = hh_wt) ) %>% 
  ungroup() %>% 
  arrange(wealth_dec, year) %>% 
  mutate(perc_growth = ((avg_wealth/lag(avg_wealth))^(1/(year-lag(year))) - 1)*100 ) %>% 
  filter(year==2014) %>% 
ggplot(aes(x=wealth_dec, y=perc_growth, fill=as.factor(wealth_dec) ) ) +
  geom_col(position ="dodge")+
  scale_y_continuous(name="Annual growth in wealth (per cent)") +
  scale_x_continuous(breaks=seq(1,10,1), name="Wealth decile")+
  theme(legend.position = "bottom",legend.title=element_blank() )


########
# consumption ginis - joint chart with HES for presentation on release

HES_cons_gini <-  readRDS(file= here("shared_dataframes", "HES_ginis.rds") )

HES_cons_gini <- HES_cons_gini %>% 
  ungroup() %>%
  mutate(source="HES") %>% 
  filter(variable == "eq_cons" | variable == "eq_cons_no_inkind" ) %>%
  filter(year>1990) 
  
HILDA_cons_gini <- dat %>%
  select(year, eq_cons, hh_wt) %>% 
  filter(year >= 2006) %>% 
  group_by(year) %>% 
  summarise(gini = gini(eq_cons, hh_wt) ) %>% 
  mutate(variable ="HILDA")

# consumption ginis chart for presentation on dark background
# emf(file = here("HILDA", "charts", "cons_ginis_presentation.emf"), width = 5.90551, height = 3.14961)

bind_rows(HES_cons_gini, HILDA_cons_gini) %>% 
  ggplot(aes(x=year, y=gini, col=variable )) +
  # geom_point(size=2) +
  geom_line(size=1.05) +
  scale_x_continuous(expand = c(0.03, 0.03), labels = inc_fin_year_labels, breaks=seq(1994, 2016, 3), name="Year" ) +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Gini coefficient", limits = c(0.0, 0.4)) +
  PC.theme.line() +
  scale_color_manual(values=c(yellow, blue, PC_green))+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none",
        axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white))

# dev.off()


#######
# Figure 4.4. - WEALTH GINIS - JOINT CHART WITH SIH                                (figure 4.4)


SIH_wealth_gini <- readRDS(file= here("shared_dataframes", "SIH_wealth_ginis.rds") )
names(SIH_wealth_gini)[1] <- "year" 
names(SIH_wealth_gini) [2] <- "gini"
SIH_wealth_gini <- mutate(SIH_wealth_gini, source="SIH",
                          year = year-1)   # because years are out of sync in HES v HILDA
SIH_wealth_gini <-  filter(SIH_wealth_gini, year>2002,
                           year!=2007)


HILDA_wealth_gini <- dat %>%
  select(year, eq_wealth, hh_wt) %>% 
  filter(year %in% wealth_years) %>% 
  group_by(year) %>% 
  summarise(gini = gini(eq_wealth, hh_wt) ) %>% 
  mutate(source="HILDA")


# income get all together
HILDA_income_gini <- dat %>%
  select(year, eq_disp_inc, hh_wt) %>% 
  group_by(year) %>% 
  summarise(gini = gini(eq_disp_inc, hh_wt) ) %>% 
  mutate(source="HILDA", variable = "eq_disp_inc")

SIHHES_income_gini <- readRDS(file= here("shared_dataframes", "SIHHES_income_ginis.rds") ) %>% 
  select(-c(eq_disp_incp, eq_yearly_disp_inc) )

SIHHES_HILDA_income_ginis <- bind_rows(SIHHES_income_gini, HILDA_income_gini)


SIHHES_HILDA_income_ginis <- SIHHES_HILDA_income_ginis %>% 
  ungroup() %>% 
  mutate(year=year-1) # to match transformed wealth ginis ####################### need to be careful if using this for other stuff.


# wealth get all together
SIH_HILDA_wealth_ginis <- bind_rows(SIH_wealth_gini, HILDA_wealth_gini) 

SIH_HILDA_wealth_ginis <- SIH_HILDA_wealth_ginis %>% 
  mutate(variable = "eq_wealth")

# income and wealth all together

SIHHES_HILDA_income_wealth_ginis <- bind_rows(SIHHES_HILDA_income_ginis, SIH_HILDA_wealth_ginis)


# wealth ginis chart (SIH and HILDA) - figure 4.4

# emf(file = here("HILDA", "charts", "wealth_ginis.emf"), width = 5.90551, height = 3.14961)

bind_rows(SIH_wealth_gini, HILDA_wealth_gini) %>% 
  ggplot(aes(x=year, y=gini, col=source )) +
  geom_point(size=2) +
  geom_line(size=1.05) +
  scale_x_continuous(expand = c(0.03, 0.03), labels = wealth_fin_year_labels, breaks=seq(2002, 2015, 2), name="Year" ) +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Gini coefficient", limits = c(0.5, 0.65)) +
  scale_color_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none")
        

# dev.off()


# same as above but for presentation on dark background, 

# emf(file = here("HILDA", "charts", "wealth_ginis_presentation_full.emf"), width = 5.90551, height = 3.14961)

bind_rows(SIH_wealth_gini, HILDA_wealth_gini) %>% 
  ggplot(aes(x=year, y=gini, col=source )) +
 # geom_point(size=2) +
  geom_line(size=1.05) +
  scale_x_continuous(expand = c(0.03, 0.03), labels = wealth_fin_year_labels, breaks=seq(2002, 2015, 2), name="Year" ) +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Gini coefficient", limits = c(0.5, 0.65)) +
  scale_color_manual(values=PCcolsPresRev)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none",
        axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white))


# dev.off()





# same as above but for presentation on dark background, half size, no points
# emf(file = here("HILDA", "charts", "wealth_ginis_presentation_half.emf"), width = 2.95, height = 3.14961)

bind_rows(SIH_wealth_gini, HILDA_wealth_gini) %>% 
  ggplot(aes(x=year, y=gini, col=source )) +
#  geom_point(size=2) +
  geom_line(size=1.05) +
  scale_x_continuous(expand = c(0.03, 0.03), labels = short_wealth_fin_year_labels, breaks=c(1988, 1993, 1998, 2003, 2009, 2015), limits=c(1988, 2015), name="Year" ) +  # breaks=seq(2002, 2015, 2), name="Year" ) +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Gini coefficient", limits = c(0.2, 0.7)) +
  scale_color_manual(values=PCcolsPresRev)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none",
        axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white))

# dev.off()



# same as above but for presentation on dark background, full size, no points
# emf(file = here("HILDA", "charts", "wealth_ginis_presentation_ful.emf"), width = 2.95, height = 3.14961)

bind_rows(SIH_wealth_gini, HILDA_wealth_gini) %>% 
  ggplot(aes(x=year, y=gini, col=source )) +
  #  geom_point(size=2) +
  geom_line(size=1.05) +
  scale_x_continuous(expand = c(0.03, 0.03), labels = short_wealth_fin_year_labels, breaks=c(1988, 1993, 1998, 2003, 2009, 2015), limits=c(1988, 2015), name="Year" ) +  # breaks=seq(2002, 2015, 2), name="Year" ) +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Gini coefficient", limits = c(0.2, 0.7)) +
  scale_color_manual(values=PCcolsPresRev)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none",
        axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white))

# dev.off()


# income and wealth, HES/SIH and HILDA - all combined for Press Club Presentation.

# svg(file = here("HILDA", "charts", "SIHHES_HILDA_income_wealth_ginis.svg"))
# emf(file = here("HILDA", "charts", "SIHHES_HILDA_income_wealth_ginis.emf"), width = 5.90551, height = 3.14961)

# chart
SIHHES_HILDA_income_wealth_ginis %>% 
  ggplot(aes(x=year, y=gini, col=source, group=interaction(source, variable)  ) ) +
#  geom_point(size=2) +
  geom_line(size=1.05) +
  scale_x_continuous(expand = c(0.03, 0.03), labels = wealth_fin_year_labels, breaks=seq(1988, 2015, 3), name="Year" ) +
  scale_y_continuous(expand = c(0, 0), labels=comma, name="Gini coefficient", limits = c(0, 0.7), breaks = seq(0,0.7,0.1)) +
  scale_color_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"), 
        legend.position = "none")

# dev.off()







#########################################
# Figure 4.5 wealth chapter - percentage of wealth held by top 10% - checking SIH
#######################################


dat %>% 
  select(wealth_dec, hhwt, eq_wealth, year) %>% 
  group_by(year, wealth_dec) %>% 
  summarise(group_total = sum(eq_wealth*hhwt)) %>% 
  ungroup() %>% 
  group_by(year) %>% 
  mutate(perc = 100*group_total/sum(group_total)) %>% 
  arrange(wealth_dec) %>% 
  mutate(culm = cumsum(perc) ) %>% 
  mutate(perc = ifelse(wealth_dec!=10, culm, perc) ) %>% 
  filter(wealth_dec %in% c(5,10)) %>% 
  ggplot(aes(x=year, y=perc, col=as.factor(wealth_dec) ) ) + 
  geom_line(size = 1.05) +
  geom_point(size = 2) +
  scale_x_continuous(expand=c(0.03, 0.03), labels = wealth_fin_year_labels, breaks=wealth_years, name="Year") +
  scale_y_continuous(expand = c(0,0), labels=comma, limits=c(0,50), breaks=seq(0,50, 5),name="Per cent") +
  scale_color_manual(values=PCcols)+
  theme(legend.position = "none", plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"))




##############
# Figure 4.6 - OECD means and medians 2017 (USD PPP)              (figure 4.6)


OECD_wealth <- read.csv(here("wealth means and medians OECD 2017.csv"), header=TRUE) 
OECD_wealth <- tbl_df(OECD_wealth)

# emf(file = here("HILDA", "charts", "OECD_wealth_means_and_medians.emf"), width = 5.90551, height = 3.14961)

OECD_wealth %>% 
  mutate(highlight = ifelse(Country=='Australia', 1, 0)) %>% 
  arrange(Average) %>%
  mutate(one = 1, counting = cumsum(one)) %>% 
  ggplot() +
  geom_col(aes(x = reorder(Country, desc(counting)), y=Average, fill=as.factor(highlight)), position="dodge") +
  geom_point(aes( x = reorder(Country, desc(counting)),  y=Median, color = as.factor(highlight)), size=3, shape=18) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.2), legend.position='none', plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"))+
  scale_fill_manual(values=PCcols)+
  scale_color_manual(values=c(black,grey))+
  scale_y_continuous(name="US Dollars (PPP)", labels=comma, expand = c(0, 0)) +
  xlab("Country") 

# dev.off()


# white writing - to go in presentations on dark background, just average

# emf(file = here("HILDA", "charts", "OECD_wealth_means_and_medians_presentation.emf"), width = 5.90551, height = 3.14961)

OECD_wealth %>% 
  mutate(highlight = ifelse(Country=='Australia', 1, 0)) %>% 
  arrange(Average) %>%
  mutate(one = 1, counting = cumsum(one)) %>% 
  ggplot() +
  geom_col(aes(x = reorder(Country, desc(counting)), y=Average, fill=as.factor(highlight)), position="dodge") +
#  geom_point(aes( x = reorder(Country, desc(counting)),  y=Median, color = as.factor(highlight)), size=3, shape=18) +
  theme(        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.2, colour=white), 
  #             plot.background = element_rect(fill = dark_grey), 
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white),
        legend.position='none', plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"))+
  scale_fill_manual(values=PCcols)+
  scale_color_manual(values=c(black,grey))+
  scale_y_continuous(name="US Dollars (PPP)", labels=comma, expand = c(0, 0)) +
  xlab("Country") 

# dev.off()




###############
# Figure 4.7 - Credit Suisse ginis of wealth         (figure 4.7)

# read in data
int_ginis <- read.csv(here("credit suisse wealth ginis 2017.csv"), header=TRUE) 
int_ginis <- tbl_df(int_ginis)

# emf(file = here("HILDA", "charts", "international_wealth_ginis.emf"), width = 5.90551, height = 3.14961)

int_ginis %>% 
  ggplot(aes(x = reorder(country, desc(gini)), y=gini, col=as.factor(highlight)  ) ) +
  geom_point(size=3) +
  geom_segment(aes(xend=reorder(country, desc(gini)), y=0, yend=gini))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.2), legend.position='none', plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"))+
  scale_color_manual(values=PCcols)+
  scale_y_continuous(limits = c(0,1), expand = c(0,0))+
  xlab("Country") +
  ylab("Gini coefficient")

# dev.off()


# white writing - to go in presentations on dark background

# emf(file = here("HILDA", "charts", "international_wealth_ginis_presentation.emf"), width = 5.90551, height = 3.14961)

int_ginis %>% 
  ggplot(aes(x = reorder(country, desc(gini)), y=gini, col=as.factor(highlight)  ) ) +
  geom_point(size=3) +
  geom_segment(aes(xend=reorder(country, desc(gini)), y=0, yend=gini))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.2, colour=white), 
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white),
                legend.position='none', plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"))+
  scale_color_manual(values=PCcolsPres)+
  scale_y_continuous(limits = c(0,1), expand = c(0,0))+
  xlab("Country") +
  ylab("Gini coefficient")

# dev.off()




##########################################################################
# Section 4.2 - Components of wealth

###########
# (in text) percentage of unequivalised wealth in owner occupied housing and super. (HILDA checking HES)
dat %>%
  filter(year==2014) %>% 
  select(hh_wt, hh_home_equity, hh_super, hh_wealth) %>% 
  summarise(perc_home = (sum(hh_wt*hh_home_equity))/(sum(hh_wt*hh_wealth)),
            perc_super = (sum(hh_wt*hh_super))/(sum(hh_wt*hh_wealth)))


################
# Figure 4.8 - Average wealth by wealth typea, for each wealth decile - In dollars (top panel) and as a share of average total wealth (bottom panel)    (figure 4.8)

dat %>% 
  filter(year==2014) %>% 
  select(year, hhwt, wealth_dec, eq_home_equity,  eq_super, eq_other_wealth) %>%
  gather(asset_type, value, -year, -hhwt, -wealth_dec) %>% 
  group_by(asset_type, wealth_dec) %>% 
  summarise(group_total = wtd.mean(value, hhwt) ) %>%
  ggplot(aes(x=as.factor(wealth_dec), y=group_total, fill = asset_type, group = asset_type))+
  geom_col(position="stack") +
  scale_y_continuous(expand=c(0,0), label=comma)+
  scale_x_discrete(expand=c(0.01, 0.01))+
  scale_fill_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        axis.title.y = element_blank(),
        legend.position = "right",
        legend.title.align = 0.5,
        legend.title = element_text(face = "bold")) +
  labs(x = "Wealth decile", fill = "Wealth type")


dat %>% 
  filter(year==2014) %>% 
  select(year, hhwt, wealth_dec, eq_home_equity, eq_other_property_equity, eq_super, eq_business_equity,  eq_financial_wealth, eq_vehicles,  eq_personal_wealth) %>%
  gather(asset_type, value, -year, -hhwt, -wealth_dec) %>% 
  group_by(asset_type, wealth_dec) %>% 
  summarise(group_total = wtd.mean(value, hhwt) ) %>%
  ggplot(aes(x=as.factor(wealth_dec), y=group_total, fill = asset_type, group = asset_type))+
  geom_col(position="fill") +
  scale_y_continuous(expand=c(0,0), label=comma)+
  scale_x_discrete(expand=c(0.01, 0.01))+
  scale_fill_manual(values=PCcols)+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        axis.title.y = element_blank(),
        legend.position = "right",
        legend.title.align = 0.5,
        legend.title = element_text(face = "bold")) +
  labs(x = "Wealth decile", fill = "Wealth type")

###################
#Figure 4.9 - Dollar change in wealth by wealth decile and wealth type, 2003-04 to 2015-16 (figure 4.9) - housing and super have driven growth in hosuehold wealth
# emf(file = here("HES","Charts","dollar_change_in_wealth_type_by_wealth_dec.emf"), width = 5.90551, height = 3.14961)

dat %>% 
  select(year, hhwt, wealth_dec, eq_home_equity, eq_other_property_equity, eq_super, eq_business_equity,  eq_financial_wealth, eq_vehicles,  eq_personal_wealth) %>%
  filter(year==2014 | year==2002) %>%
  gather(wealth_type,value,c(-year,-wealth_dec, -hhwt)) %>%
  group_by(year, wealth_type, wealth_dec) %>%
  summarise(mean_value=wtd.mean(value, weight = hhwt)) %>%
  ungroup() %>%
  arrange(wealth_dec,wealth_type,year) %>%
  mutate(diff=mean_value-lag(mean_value,1)) %>%
  filter(year==2014) %>%
  ggplot(aes(x = wealth_type, 
             y = diff,
             group = wealth_dec,
             fill = wealth_dec)) +
  geom_col(position="dodge")+
  scale_y_continuous(labels = comma, expand=c(0.01,0.01), limits = c(-100000, 275000), breaks = seq(-100000,250000,50000))+
  scale_x_discrete(expand = c(0.01, 0.01))+
  labs(x = "Type of wealth", fill = "Wealth decile:")+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text(),
        axis.line.x = element_blank(),
        axis.title.y = element_blank())+
  geom_hline(yintercept = 0, colour = grey)+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))

# dev.off()

####################################################################
# SECTION 4.3


#########
# Figure 4.10 - income and wealth deciles JOINT DISTRIBUTION (2014-15 for comparison, most recetn HILDA wealth year)      (Figure 4.10)       


inc_wealth_prop <- as.data.frame(prop(wtd.table(dat$inc_dec[dat$year == 2014], 
                                                dat$wealth_dec[dat$year == 2014],
                                                dat$hhwt[dat$year == 2014])))

colnames(inc_wealth_prop) <- c("inc_dec", "wealth_dec", "per_cent")

inc_wealth_prop <- inc_wealth_prop %>% filter(inc_dec!="Total", wealth_dec!="Total")


inc_wealth_prop %>% 
  ggplot(aes(x = factor(wealth_dec), 
             y = per_cent,
             fill = as.numeric(inc_dec),
             group = factor(inc_dec))) +
  geom_col(position = "dodge")+
  scale_y_continuous(expand=c(0,0))+
  scale_x_discrete(expand=c(0.1, 0.01))+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.position = c(0.5,0.9),
        legend.title.align = 0.5,
        legend.title = element_text(face = "bold")) +
  labs(x = "Wealth decile", y = "Per cent", fill = "Disposable income decile")+
  PCcols_gradient+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))+
  geom_hline(yintercept = 1, colour = grey, size = 0.5, linetype = "dashed")




#################
# Figure 4.11 (a) INCOME AND PRIVATE (not final) CONSUMPTION MEANS by wealth decile (have to do wealth separately because of scale)           - (Figure 4.11 (a) and (b))

dat %>% 
  filter(year==2014) %>% 
  select(hh_wt, wealth_dec, eq_disp_inc, eq_cons) %>%    
  gather(key="measure", value="value", -hh_wt, -wealth_dec) %>% 
  group_by(wealth_dec, measure) %>% 
  summarise(wtd_avg = wtd.mean(value, hh_wt)) %>% 
  ggplot(aes(x=wealth_dec,y=wtd_avg,group=factor(measure),fill=factor(measure))) +
  geom_col(position ="dodge")+
  facet_grid(. ~ measure) +
  scale_y_continuous(labels=comma,name="average income/consumption", breaks=seq(0,250000,10000))+
  scale_x_continuous(breaks=seq(1,10,1), name="Wealth decile")+
  theme(legend.position = "bottom",legend.title=element_blank()) +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.position = "none",
        axis.title.y=element_blank()) +
  scale_fill_manual(values = PCcols)

# Figure 4.11 (b)- wealth
dat %>% 
  filter(year==2014) %>% 
  select(hh_wt, wealth_dec, eq_wealth) %>%    
  gather(key="measure", value="value", -hh_wt, -wealth_dec) %>% 
  group_by(wealth_dec, measure) %>% 
  summarise(wtd_avg = wtd.mean(value, hh_wt)) %>% 
  ggplot(aes(x=wealth_dec,y=wtd_avg,group=factor(measure),fill=factor(measure))) +
  geom_col(position ="dodge")+
  facet_grid(. ~ measure) +
  scale_y_continuous(labels=comma,name="average income/consumption", breaks=seq(0,2500000,500000))+
  scale_x_continuous(breaks=seq(1,10,1), name="Wealth decile")+
  theme(legend.position = "bottom",legend.title=element_blank()) +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.position = "none",
        axis.title.y=element_blank()) +
  scale_fill_manual(values = PCcols)


##############
# Figure 4.12 (a and b) - share of age group in each income and wealth decile

# Figure 4.12 (a)  wealth deciles
dat %>% 
  filter(year==2014) %>% 
  group_by(wealth_dec, age_group_string) %>% 
  summarise(pop = sum(hhwt)) %>% 
  ungroup() %>% 
  group_by(age_group_string) %>% 
  mutate(percent = pop/sum(pop)) %>% 
  ggplot(aes(x=factor(age_group_string), y=percent*100, fill=wealth_dec, group=wealth_dec ) ) +
  # geom_hline(yintercept = 10)+
  geom_col(position = "dodge")+
  scale_x_discrete(expand = c(0.1, 0.1))+
  scale_y_continuous(expand=c(0,0), limits = c(0,25))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text()) +
  labs(x = "Age group", y = "Per cent", fill = "Wealth decile")+
  geom_hline(yintercept = 10, colour = grey, size = 0.5, linetype = "dashed")+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))
  
  

# Figure 4.12 (b) income deciles
dat %>% 
  filter(year==2016) %>% 
  group_by(inc_dec, age_group_string) %>% 
  summarise(pop = sum(hhwt)) %>% 
  ungroup() %>% 
  group_by(age_group_string) %>% 
  mutate(percent = pop/sum(pop)) %>% 
  ggplot(aes(x=factor(age_group_string), y=percent*100, fill=inc_dec, group=inc_dec ) ) +
  # geom_hline(yintercept = 10)+
  geom_col(position = "dodge")+
  scale_x_discrete(expand = c(0.1, 0.1))+
  scale_y_continuous(expand=c(0,0), limits = c(0,25))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text()) +
  labs(x = "Age group", y = "Per cent", fill = "Income decile")+
  geom_hline(yintercept = 10, colour = grey, size = 0.5, linetype = "dashed")+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))



# Figure 4.13 - Average equivalised wealth by age and birth decade - uses HILDA longitudinal


############
# Figure 4.14 (a) - group by household type and show proportion in each decile - WEALTH     -      Figure 4.14 (a)

dat %>%
  filter(year==2014) %>%
  group_by(wealth_dec, household_type) %>%
  summarise(pop = sum(hhwt)) %>%
  ungroup() %>%  complete(wealth_dec,household_type) %>% 
  group_by(household_type) %>%
  mutate(percent = pop/sum(pop, na.rm=TRUE)) %>% 
  ggplot(aes(x = household_type, y=percent*100, fill=wealth_dec, group=wealth_dec) ) +
  # geom_hline(yintercept = 10)+
  geom_col(position = "dodge")+
  scale_x_discrete(expand = c(0.1, 0.1), labels = household_type_labels_spaced)+
  scale_y_continuous(expand=c(0,0))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text()) +
  labs(x = "Household type", y = "Per cent", fill = "Wealth decile:")+
  geom_hline(yintercept = 10, colour = grey, size = 0.5, linetype = "dashed")


# Figure 4.14 (b) - group by household type and show proportion in each decile - INCOME   -        Figure 4.14 (b)

dat %>%
  group_by(year, inc_dec, household_type) %>%
  summarise(pop = sum(hhwt)) %>%
  ungroup() %>%
  group_by(year, household_type) %>%
  mutate(percent = pop/sum(pop)) %>%
  filter(year==2014) %>% 
  ggplot(aes(x = household_type, y=percent*100, fill=inc_dec, group=inc_dec)) +
  # geom_hline(yintercept = 10)+
  geom_col(position = "dodge")+
  scale_x_discrete(expand = c(0.1, 0.1), labels = household_type_labels_spaced)+
  scale_y_continuous(expand=c(0,0), breaks = seq(0,60,10))+
  PCcols_gradient+
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm"),
        legend.title = element_text()) +
  labs(x = "Household type", y = "Per cent", fill = "Income decile:")+
  geom_hline(yintercept = 10, colour = grey, size = 0.5, linetype = "dashed")+
  guides(fill=guide_legend(nrow=1, byrow=TRUE))


# check HES result in HILDA (in text)
# Retiree households who receive pension payments typically have low incomes, although many have substantial wealth. More than half of this wealth is in the form of owner occupied housing 

dat %>% 
  filter(year==2014) %>% 
  select(household_type, eq_wealth, eq_home_equity, hhwt) %>% 
  group_by(household_type) %>% 
  summarise(avg_wealth = wtd.mean(eq_wealth, hhwt) ,
            avg_home_wealth = wtd.mean(eq_home_equity, hhwt)) %>%
  mutate(percent = avg_home_wealth/avg_wealth) 



##############################################
# Chapter 5 - MOBILITY
############################################


########
# earnings elasticities - grouped by son's position (figure 5.2)

earning <- read.csv(here("shared_dataframes", "earnings_elasticities.csv"), header=TRUE) 

earning <- earning %>%
  gather(key="son", value="probability", -dad_percentile)

# emf(file = here("HILDA", "charts", "earnings_elasticities.emf"), width = 5.90551, height = 3.14961)

earning %>% 
  ggplot(aes(x=son, y=probability, fill=dad_percentile)) + 
  geom_col(position="dodge") +
  theme(  legend.title = element_text("Father's earnings percentile" ), legend.position = "top" ) +
  scale_y_continuous(name="Probability", breaks=seq(0,70,10), expand = c(0.0001, 0.0001))+
  scale_x_discrete(name="Son's position in earnings distribution", labels=son_position_labels, expand = c(0.02, 0.02)) +
  scale_fill_manual(values= PCcols,labels=father_percentile_labels, name="Father's earnings percentile:") +
  theme(plot.margin = unit(c(0.13,0.25,0.13,0.25), "cm") )

# dev.off()

##########
# median deciles by age - median person (figure 5.7)

# create joint dataframe with medians for plotting as facet with median person's decile for income, wealth and consumption
med_dec <- dat %>%
  filter(age<90) %>% 
  group_by(year, age) %>%
  summarise(med_inc_dec = wtd.quantile(inc_dec, weights = hh_wt, probs=0.5) ) %>%
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_inc = median(med_inc_dec)) 

med_dec_wealth <- dat %>%
  filter(year %in% wealth_years) %>% 
  filter(age<90) %>% 
  group_by(year, age) %>%
  summarise(med_wealth_dec = wtd.quantile(wealth_dec, weights = hh_wt, probs=0.5) )%>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_wealth = median(med_wealth_dec)) 
# join to other dataframe and remove
med_dec <- left_join(med_dec, med_dec_wealth, by="age")
rm(med_dec_wealth) 

med_dec_cons <- dat_cons %>%
  filter(age<90) %>%
  group_by(age) %>%
  summarise(med_cons_dec = wtd.quantile(cons_dec, weights = hh_wt, probs=0.5) ) %>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_cons = median(med_cons_dec)) 
# join to other dataframe and remove
med_dec <- left_join(med_dec, med_dec_cons, by="age")
rm(med_dec_cons)


## income

# emf(file = here("HILDA", "charts", "med_dec_inc.emf"), width = 5.90551, height = 2.3)

dat %>%
  filter(age >= 15 & age<=85) %>% 
  group_by(year, age) %>%
  summarise(med_inc_dec = wtd.quantile(inc_dec, weights = hh_wt, probs=0.5) ) %>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_inc = median(med_inc_dec)) %>% 
  ggplot(aes(x=age, y=middle_inc)) +
  geom_point(col=blue) +
  scale_y_continuous(breaks = seq(0,10,2), limits = c(0,10), name="Median income decile") +
  scale_x_continuous(breaks = seq(0,90,10), limits = c(15,85), name="Age" )

# dev.off()


## income for presentation on a dark background

# emf(file = here("HILDA", "charts", "med_dec_inc_presentation.emf"), width = 5.90551, height = 2.3)

dat %>%
  filter(age >= 15 & age<=85) %>% 
  group_by(year, age) %>%
  summarise(med_inc_dec = wtd.quantile(inc_dec, weights = hh_wt, probs=0.5) ) %>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_inc = median(med_inc_dec)) %>% 
  ggplot(aes(x=age, y=middle_inc)) +
  geom_point(col=blue) +
  scale_y_continuous(breaks = seq(0,10,2), limits = c(0,10), name="Median income decile") +
  scale_x_continuous(breaks = seq(0,90,10), limits = c(15,85), name="Age" ) +
  theme(axis.text.x = element_text(colour=white),
        axis.text.y = element_text(colour=white), 
        axis.title.x = element_text(colour=white),   
        axis.title.y = element_text(colour=white))

# dev.off()




### wealth

# emf(file = here("HILDA", "charts", "med_dec_wealth.emf"), width = 5.90551, height = 2.3)

dat %>%
  filter(year %in% wealth_years) %>% 
  filter(age>=15 & age<=85) %>% 
  group_by(year, age) %>%
  summarise(med_wealth_dec = wtd.quantile(wealth_dec, weights = hh_wt, probs=0.5) )%>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_wealth = median(med_wealth_dec))  %>% 
  ggplot(aes(x=age, y=middle_wealth)) +
  geom_point(col=dark_blue) +
  scale_y_continuous(breaks = seq(0,10,2), limits = c(0,10), name="Median wealth decile") +
  scale_x_continuous(breaks = seq(0,90,10), limits = c(15,85), name="Age" )

# dev.off()

# wealth - PC green and white writing for presentation on a dark background

# emf(file = here("HILDA", "charts", "med_dec_wealth_presentation.emf"), width = 5.90551, height = 2.3)

dat %>%
  filter(year %in% wealth_years) %>% 
  filter(age>=15 & age<=85) %>% 
  group_by(year, age) %>%
  summarise(med_wealth_dec = wtd.quantile(wealth_dec, weights = hh_wt, probs=0.5) )%>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_wealth = median(med_wealth_dec))  %>% 
  ggplot(aes(x=age, y=middle_wealth)) +
  geom_point(col=PC_green) +
  scale_y_continuous(breaks = seq(0,10,2), limits = c(0,10), name="Median wealth decile") +
  scale_x_continuous(breaks = seq(0,90,10), limits = c(15,85), name="Age" ) +
  theme(axis.text.x = element_text(colour=white),
axis.text.y = element_text(colour=white), 
axis.title.x = element_text(colour=white),   
axis.title.y = element_text(colour=white))

# dev.off()



## consumption

# emf(file = here("HILDA", "charts", "med_dec_cons.emf"), width = 5.90551, height = 2.3)

dat_cons %>%
  filter(age>=15 & age<=85) %>% 
  group_by(year, age) %>%
  summarise(med_cons_dec = wtd.quantile(cons_dec, weights = hh_wt, probs=0.5) ) %>% 
  ungroup() %>% 
  group_by(age) %>% 
  summarise(middle_cons = median(med_cons_dec)) %>% 
  ggplot(aes(x=age, y=middle_cons)) +
  geom_point(col=PC_green) +
  scale_y_continuous(breaks = seq(0,10,2), limits = c(0,10), name="Median consumption decile") +
  scale_x_continuous(breaks = seq(0,90,10), limits = c(15,85), name="Age" )

# dev.off()



#######################################################################
# CHAPTER 6 - POVERTY
#######################################################################

########## Need to do this stuff first for poverty

##Indicator variable for Headey definition of financial poverty (using liquid assets less than 25% of poverty lines as filter)
dat <- dat %>% 
  group_by(year) %>%
  mutate(fin_poverty_Headey_liquid_assets = ifelse(year %in% wealth_years & year %in% cons_years,
                                                   ifelse(eq_disp_inc < (0.5 * weighted.quantile(eq_disp_inc, w = hhwt, probs = 0.5)) &
                                                            eq_cons < (0.5 * weighted.quantile(eq_cons, w = hhwt, probs = 0.5)) &
                                                            eq_liquid_assets < (0.25 * 0.5 * weighted.quantile(eq_disp_inc, w = hhwt, probs = 0.5)), 1 ,0),
                                                   NA) ) 

# Consumption poverty (no in kind transfers in HILDA)
dat <- dat %>% 
  group_by(year) %>%
  mutate(cons_no_inkind_poverty = ifelse(year %in% cons_years,
                                         ifelse(eq_cons < (0.5 * wtd.quantile(eq_cons, w = hhwt, probs = 0.5) ), 1 ,0),
                                         NA) ) 

#Variable for half of median cons (no in kind in HILDA)
dat <- dat %>% 
  group_by(year) %>%
  mutate(cons_no_inkind_poverty_line = ifelse(year %in% cons_years, 0.5 * wtd.quantile(eq_cons, w = hhwt, probs=0.5),
                                    NA))

########### Can go bit-by-bit from here (in poverty section)




###################
# Figure 6.1 - Plot of percentage in poverty by year (figure 6.1)

#Per cent in poverty - check against HES ################ (note that consumption and wealth are for current year, but income is for previous financial year)
perc_poverty <- dat %>%
  group_by(year) %>%
  summarise(income = weighted.mean(inc_poverty, w = hhwt),
            cons_no_in_kind = weighted.mean(cons_no_inkind_poverty, w = hhwt),   # can't do final consumption in HILDA (no in kind)
            financial_Headey_liquid_assets = weighted.mean(fin_poverty_Headey_liquid_assets, w = hhwt))

perc_poverty %>%
  gather(key = "variable", value = "percent",-year) %>%
  filter(variable == "income" | variable == "cons_no_in_kind" | variable == "financial_Headey_liquid_assets") %>%
  ggplot(aes(x=year,y=percent*100,col=variable)) +
  geom_line(size=1)+
  geom_point(size=3, shape=16)+
  scale_y_continuous(name="Per cent")+
 scale_x_continuous(name = "", breaks = HES_years, labels = inc_fin_year_labels)+
  scale_color_manual(values=PCcols, labels = poverty_labels)


# figure 6.2 - can't match anchored poverty in HILDA as doesn't go back far enough 


#################
# Figure 6.3 - Average relative povery gaps  (dollars and per cent)
#  - to check against HES  (note that consumption and wealth are for current year, but income is for previous financial year)

#Variable for half of median disposable income
dat <- dat %>% 
  group_by(year) %>%
  mutate(inc_poverty_line = 0.5 * wtd.quantile(eq_disp_inc, w = hhwt, probs=0.5))

#Variable for half of median consumption (no inkind) - not possible prior to 2006 (no consumption data)
dat <- dat %>% 
  group_by(year) %>%
  mutate(cons_no_inkind_poverty_line = ifelse(year<2006,-99999, 0.5 * wtd.quantile(eq_cons, w = hhwt, probs=0.5) ) )

#Mean poverty gap (dollars and percent)
mean_poverty_gap_dollars <- dat %>%
  group_by(year) %>%
  summarise(income = weighted.mean(inc_poverty_line - eq_disp_inc, w = hhwt * inc_poverty),
            cons_no_in_kind = weighted.mean(cons_no_inkind_poverty_line - eq_cons, w = hhwt * cons_no_inkind_poverty))

mean_poverty_gap_percent <- dat %>%
  group_by(year) %>%
  summarise(income = weighted.mean((inc_poverty_line - eq_disp_inc) / inc_poverty_line, w = hhwt * inc_poverty),
            cons_no_in_kind = weighted.mean((cons_no_inkind_poverty_line - eq_cons) / cons_no_inkind_poverty_line, w = hhwt * cons_no_inkind_poverty))

# Plot of mean poverty gaps by year (dollars) (figure 6.3 a)  

mean_poverty_gap_dollars %>%
  gather(key = "variable", value = "value",-year) %>%
  ggplot(aes(x=year,y=value,col=variable)) +
  geom_line(size=1)+
  geom_point(size=3, shape=16)+
  scale_y_continuous(name="$ (2016-17)", labels = comma, limits = c(9,10000))+
 scale_x_continuous(name = "", breaks = HES_years, labels = inc_fin_year_labels)+
  scale_color_manual(values=PCcols)


#Plot of mean poverty gaps by year (percent) (figure 6.3 b)

mean_poverty_gap_percent %>%
  gather(key = "variable", value = "percent",-year) %>%
  ggplot(aes(x=year,y=percent*100,col=variable)) +
  geom_line(size=1)+
  geom_point(size=3, shape=16)+
  scale_y_continuous(name="Per cent", limits = c(0,80))+
  scale_x_continuous(name = "", breaks = HES_years, labels = inc_fin_year_labels)+
  scale_color_manual(values=PCcols)


# Figure 6.4 - This chart done with HILDA data in excel



################
# Figure 6.5 and 6.6

#Per cent in poverty by household type

perc_poverty_by_household <- dat %>%
  group_by(year, household_type_poverty) %>%
  summarise(income = weighted.mean(inc_poverty, w = hhwt),
            cons_no_inkind = weighted.mean(cons_no_inkind_poverty, w = hhwt))

# Figure 6.5
perc_poverty_by_household %>% 
  filter(year==2016) %>% 
  gather(key="variable", value="percent", -household_type_poverty, -year) %>% 
  ggplot(aes(x=household_type_poverty, y=percent, fill=variable)) +
  geom_col(position="dodge")

# Figure 6.6 - panel a
perc_poverty_by_household %>% 
  filter(year %in% cons_years) %>% 
  ggplot(aes(x=year, y=cons_no_inkind, col=household_type_poverty) ) +
  geom_line() +
  geom_point()

# Figure 6.6 - panel b
perc_poverty_by_household %>% 
  ggplot(aes(x=year, y=income, col=household_type_poverty) ) +
  geom_line() +
  geom_point()



####################
# Figure 6.7 - income poverty rates by age group

# Per cent in poverty by age group

inc_poverty_by_age_group <- dat %>%
  filter(year %in% HES_years) %>% 
  group_by(year, age_group) %>%
  summarise(HILDA_inc_poverty_rate = sum(inc_poverty*hhwt) / sum(hhwt) ) %>%
  arrange(age_group) 


inc_poverty_by_age_group %>%
  ggplot(aes(x=year, y=HILDA_inc_poverty_rate, col=age_group)) +
  geom_line() +
  geom_point()


####################
# Figure 6.8 - consumption poverty rates by age group

cons_no_inkind_poverty_by_age_group <- dat %>%
  filter(year %in% HES_years & year>=2006) %>% 
  group_by(year, age_group) %>%
  summarise(cons_no_inkind_poverty_rate = sum(cons_no_inkind_poverty*hhwt) / sum(hhwt) ) %>%
  arrange(age_group) 

cons_no_inkind_poverty_by_age_group %>% 
  filter(year %in% cons_years) %>% 
  ggplot(aes(x=year, y=cons_no_inkind_poverty_rate, col=age_group)) +
  geom_line() +
  geom_point()


###################################################################
# MATERIAL DEPRIVATION (IN POVERTY CHAPTER)
################################################################


# 1. Filter to include only the 2014 wave #
# 2. select only relevant variables, turn factor variables into integers and adjust to base 0 #

mat_deprivation <- dat %>%
  filter(year==2014) %>% 
  dplyr::select(hh_id, hh_wt, rp_wt, inc_poverty, starts_with("mdha")) 

mat_deprivation <- mat_deprivation %>%
  mutate(mdschoolclothes = as.integer(mdhacbk)-10,
         mdchilddentist = as.integer(mdhacdc)-10,
         mdchildhobby = as.integer(mdhacho)-10,
         mdchildbeds = as.integer(mdhacsb)-10,
         mdchildschooltrip = as.integer(mdhacst)-10,
         mddentist = as.integer(mdhadt)-10,
         mdfurniture = as.integer(mdhafu)-10,
         mdcatchup = as.integer(mdhagt)-10,
         mdheating = as.integer(mdhahaw)-10,
         mdhcinsurance = as.integer(mdhahci)-10,
         mdmedical = as.integer(mdhamt)-10,
         mdvehicle = as.integer(mdhamv)-10,
         mdcarinsure = as.integer(mdhamvi)-10,
         mdcphone = as.integer(mdhaph)-10,
         mdscripts = as.integer(mdhapm)-10,
         mdroof = as.integer(mdharg)-10,
         mdsavings = as.integer(mdhasa)-10,
         mdsecure = as.integer(mdhash)-10,
         mdlocks = as.integer(mdhasl)-10,
         mdfood = as.integer(mdhasm)-10,
         mdwarmclothes = as.integer(mdhawc)-10,
         mdwasher = as.integer(mdhawm)-10)



# 3. mutate each new material deprivation column to binary, where 1 = deprived #

mat_deprivation <- mat_deprivation %>%
  mutate(mdschoolclothes1 = ifelse(mdschoolclothes == 1, 1, 0),
         mdchilddentist1 = ifelse(mdchilddentist == 1, 1, 0),
         mdchildhobby1 = ifelse(mdchildhobby == 1, 1, 0),
         mdchildbeds1 = ifelse(mdchildbeds == 1, 1, 0),
         mdchildschooltrip1 = ifelse(mdchildschooltrip == 1, 1, 0),
         mddentist1 = ifelse(mddentist == 1, 1, 0),
         mdfurniture1 = ifelse(mdfurniture == 1, 1, 0),
         mdcatchup1 = ifelse(mdcatchup == 1, 1, 0),
         mdheating1 = ifelse(mdheating == 1, 1, 0),
         mdhcinsurance1 = ifelse(mdhcinsurance == 1, 1, 0),
         mdmedical1 = ifelse(mdmedical == 1, 1, 0),
         mdvehicle1 = ifelse(mdvehicle == 1, 1, 0),
         mdcarinsure1 = ifelse(mdcarinsure == 1, 1, 0),
         mdcphone1 = ifelse(mdcphone == 1, 1, 0),
         mdscripts1 = ifelse(mdscripts == 1, 1, 0),
         mdroof1 = ifelse(mdroof == 1, 1, 0),
         mdsavings1 = ifelse(mdsavings == 1, 1, 0),
         mdsecure1 = ifelse(mdsecure == 1, 1, 0),
         mdlocks1 = ifelse(mdlocks == 1, 1, 0),
         mdfood1 = ifelse(mdfood == 1, 1, 0),
         mdwarmclothes1 = ifelse(mdwarmclothes == 1, 1, 0),
         mdwasher1 = ifelse(mdwasher == 1, 1, 0))



# 4. mutate new column for total deprivation score: sum of all the binary columns created above #
# (I couldn't figure out how to do this shorthand i.e. by summing across the range of columns, so had to write them all out) #

mat_deprivation <- mat_deprivation %>%
  mutate(mdtotal = mdschoolclothes1 + mdchilddentist1 + mdchildhobby1 
         + mdchildbeds1 + mdchildschooltrip1 + mddentist1 
         + mdfurniture1 + mdcatchup1 + mdheating1 
         + mdhcinsurance1 + mdmedical1 + mdvehicle1 
         + mdcarinsure1 + mdcphone1 + mdscripts1 
         + mdroof1 + mdsavings1 + mdsecure1 
         + mdlocks1 + mdfood1 + mdwarmclothes1 + mdwasher1)



# 5. mutate new binary columns for "deprived of 2+ essentials" and "deprived of 3+ essentials" #

mat_deprivation <- mat_deprivation %>%
  mutate(deprived2 = ifelse(mdtotal > 1, 1, 0),
         deprived3 = ifelse(mdtotal > 2, 1, 0))

View(mat_deprivation)

# 6. create new tbl only for respondents in income poverty and cut out intermediate variables (for determining deprivation score) #

poverty_dep <- mat_deprivation %>%
  select(hh_id, hh_wt, rp_wt, inc_poverty, mdtotal, deprived2, deprived3) %>%
  filter(inc_poverty == 1)

View(poverty_dep)

# 7. Calculate average deprivation score for people in income poverty #

poverty_dep %>%
  summarise(mean_ds = wtd.mean(mdtotal, weights=hh_wt))

### Result: 1.12 ###



# 8. calculate proportion of people in income poverty who are deprived of >=2 and >=3 essentials #

poverty_dep %>%
  summarise(perc_deprived2 = wtd.mean(deprived2, weights=hh_wt))

### Result: 29.1% ###


poverty_dep %>%
  summarise(perc_deprived3 = wtd.mean(deprived3, weights=hh_wt))

### Result: 18.7% ###



# 9. create new tbl only for respondents who are deprived of >=2 essentials and cut out intermediate variables #

deprivation_pov <- mat_deprivation %>%
  select(hh_id, hh_wt, rp_wt, inc_poverty, mdtotal, deprived2, deprived3) %>%
  filter(deprived2 == 1)

View(deprivation_pov)



# 10. Calculate proportions of 2-deprived and 3-deprived people who are in income poverty #

deprivation_pov %>%
  summarise(dep2_poverty = wtd.mean(inc_poverty, weights=hh_wt))

### Result: 24.9% ###


deprivation_pov <- deprivation_pov %>%
  filter(deprived3 == 1)

deprivation_pov %>%
  summarise(dep3_poverty = wtd.mean(inc_poverty, weights=hh_wt))

### Result: 28.4% ###



# 11. Finally, calculate what % of all people are both in income poverty AND 2-deprived (and then 3-deprived)  #

mat_deprivation <- mat_deprivation %>%
  mutate(dep2_plus_poverty = ifelse(deprived2 == 1 & inc_poverty == 1, 1, 0),
         dep3_plus_poverty = ifelse(deprived3 == 1 & inc_poverty == 1, 1, 0))

mat_deprivation %>%
  summarise(perc_overlap2 = wtd.mean(dep2_plus_poverty, weights=hh_wt))

### Result: 2.9% ###


mat_deprivation %>%
  summarise(perc_overlap3 = wtd.mean(dep3_plus_poverty, weights=hh_wt))

### Result: 1.9% ###