# Productivity Commission 2018 analysis for the Rising Inequality? Commission Research Paper

# HILDA variable construction - will work with the Cross sec, Longitudinal and short longitudinal data files.



#########################################################
# FUNCTIONS

# Pin the preferred implementation wherever several attached packages export
# the same name (e.g. wtd.mean exists in both Hmisc and reldist).

# dplyr verbs
select   <- dplyr::select
distinct <- dplyr::distinct
lag      <- dplyr::lag

# weighted statistics
wtd.mean     <- Hmisc::wtd.mean
wtd.var      <- Hmisc::wtd.var
wtd.quantile <- Hmisc::wtd.quantile
wtd.table    <- questionr::wtd.table

# inequality measure
gini <- reldist::gini

#######################################################
# CPI - input and add to dataframe

# Read the CPI multipliers used to convert to 2016-17 dollars
# (see CPI calculations spreadsheet in data folder)
CPI <- here("CPI.xlsx") %>%
  read_excel(sheet = "Deflators")

# attach the matching deflator to every observation, by year
dat <- dat %>%
  left_join(CPI, by = "year")

# the lookup table is no longer needed
rm(CPI)

#####################################
# LABELS
#####################################



# Financial year labels for income - accounting for lag
# NOTE(review): "1996" and "1997" both map to "1996-97", and "2000" and "2001"
# both map to "2000-01" - confirm this is intended.
inc_fin_year_labels <- as_labeller(c(
  "1994" = "1994-95",
  "1996" = "1996-97",
  "1997" = "1996-97",
  "1998" = "1997-98",
  "2000" = "2000-01",
  "2001" = "2000-01",
  "2002" = "2001-02",
  "2003" = "2002-03",
  "2004" = "2003-04",
  "2005" = "2004-05",
  "2006" = "2005-06",
  "2007" = "2006-07",
  "2008" = "2007-08",
  "2009" = "2008-09",
  "2010" = "2009-10",
  "2011" = "2010-11",
  "2012" = "2011-12",
  "2013" = "2012-13",
  "2014" = "2013-14",
  "2015" = "2014-15",
  "2016" = "2015-16"
))

# Financial year labels for wealth: every listed year maps to the financial
# year that starts in it (e.g. "2001" -> "2001-02"). 1989, 1995 and 1999 have
# no label.
wealth_fin_year_labels <- as_labeller(local({
  yrs <- c(1988, 1990:1994, 1996:1998, 2000:2016)
  setNames(sprintf("%d-%02d", yrs, (yrs + 1) %% 100), yrs)
}))

# Abbreviated financial year labels for wealth; 2004, 2008 and 2012 are
# deliberately given an empty label.
short_wealth_fin_year_labels <- as_labeller(c(
  "1988" = "'88-89",
  "1993" = "'93-94",
  "1998" = "'98-99",
  "2001" = "'01-02",
  "2002" = "'02-03",
  "2003" = "'03-04",
  "2004" = "",
  "2005" = "'05-06",
  "2006" = "'06-07",
  "2007" = "'07-08",
  "2008" = "",
  "2009" = "'09-10",
  "2010" = "'10-11",
  "2011" = "'11-12",
  "2012" = "",
  "2013" = "'13-14",
  "2014" = "'14-15",
  "2015" = "'15-16",
  "2016" = "'16-17"
))



# Decile labels: 1 -> "Bottom", 10 -> "Top", 2 to 9 keep their number
inc_dec_labels <- as_labeller(
  setNames(c("Bottom", as.character(2:9), "Top"), 1:10)
)


# Labels for the son's position in the distribution
son_position_labels <- as_labeller(c(
  "bottom_decile"   = "Bottom decile",
  "bottom2_deciles" = "Bottom two deciles",
  "top_5_deciles"   = "Top half of\nthe distribution",
  "top2_deciles"    = "Top two deciles",
  "xTop_decile"     = "Top decile"
))

# Labels for the father's percentile
father_percentile_labels <- as_labeller(c(
  "_5th" = "5th",
  "20th" = "20th",
  "50th" = "50th",
  "80th" = "80th",
  "95th" = "95th"
))






# Names of the equivalised income component variables
income_types <- paste0("eq_", c("lab_inc", "cap_inc", "trans_inc", "inc_tax"))

# Age group identifiers: under 15, ten-year bands from 15 to 64, then 65 plus
age_groups <- c(
  "under15s",
  paste0(seq(15, 55, by = 10), "to", seq(24, 64, by = 10)),
  "65plus"
)

# Same age groups without the under 15s
HESP_age_groups <- age_groups[age_groups != "under15s"]

# Period labels, keyed by the end year of each period
period_labels <- as_labeller(c(
  "2010" = "'03-04 to '09-10",
  "2016" = "'09-10 to '15-16"
))

# HESP uses exactly the same period labels
HESP_period_labels <- period_labels

# Labels for the income concepts
income_labels <- as_labeller(c(
  "eq_disp_inc"  = "Disposable income",
  "eq_priv_inc"  = "Private income",
  "eq_gross_inc" = "Gross income"
))

# Labels for the Gini comparison across income and consumption measures
consumption_gini_labels <- as_labeller(c(
  "eq_disp_inc"       = "Disposable income",
  "eq_cons"           = "Final consumption",
  "eq_cons_exp"       = "Consumption expenditure",
  "eq_cons_no_inkind" = "Private consumption"
))

# Labels for the consumption versus disposable income comparison
consumption_labels <- as_labeller(c(
  "eq_cons"     = "Private consumption",
  "eq_disp_inc" = "Disposable income"
))


# Labels for the in-kind transfer types
inkind_type_labels <- as_labeller(c(
  "eq_inkind_health"    = "Health",
  "eq_inkind_educ"      = "Education",
  "eq_inkind_welfare"   = "Welfare",
  "eq_inkind_childcare" = "Childcare",
  "eq_inkind_govt_rent" = "Government housing"
))

# Labels for the marginal effect series
marginal_effect_labels <- as_labeller(c(
  "gross_less_cap"   = "Capital income",
  "gross_less_lab"   = "Labour income",
  "gross_less_trans" = "Transfer income",
  "disp_less_tax"    = "Income tax"
))


# Income decile labels (repeats the definition made earlier in this file)
inc_dec_labels <- as_labeller(c(
  "1"  = "Bottom",
  "2"  = "2",
  "3"  = "3",
  "4"  = "4",
  "5"  = "5",
  "6"  = "6",
  "7"  = "7",
  "8"  = "8",
  "9"  = "9",
  "10" = "Top"
))

# wealth deciles share the income decile labels
wealth_dec_labels <- inc_dec_labels


# Labels for the income components
income_type_labels <- as_labeller(c(
  eq_lab_inc   = "Labour income",
  eq_cap_inc   = "Capital income",
  eq_trans_inc = "Transfer income",
  eq_inc_tax   = "Income tax"
))

# Labels for the seven wealth components
# NOTE(review): the keys eq_business, eq_financial_equity, eq_vehicle_equity
# and eq_personal_equity differ from the columns built later in this script
# (eq_business_equity, eq_financial_wealth, eq_vehicles, eq_personal_wealth)
# - confirm which data set these labels are meant for.
wealth_type_labels <- as_labeller(c(
  eq_home_equity           = "Owner occupied\nhousing",
  eq_other_property_equity = "Other property",
  eq_super                 = "Super",
  eq_business              = "Business",
  eq_financial_equity      = "Financial",
  eq_vehicle_equity        = "Vehicle",
  eq_personal_equity       = "Personal"
))

# Labels for the three-way wealth split
wealth_type_labels_consolidated <- as_labeller(c(
  eq_home_equity = "Owner occupied\nhousing",
  eq_super       = "Superannuation",
  eq_other       = "Other"
))


# Labels for the seven household types
household_type_labels <- as_labeller(c(
  family_employed_1      = "Family, 1 income",
  family_employed_2      = "Family, 2+ incomes",
  family_unemployed      = "Family, no paid work",
  retiree_other          = "Retiree, no pension",
  retiree_pension        = "Retiree, receiving pension",
  working_age_employed   = "Working age, employed",
  working_age_unemployed = "Working age, no paid work"
))

# Labels for the five collapsed household types used for poverty
household_type_labels_poverty <- as_labeller(c(
  family_employed        = "Family,\n1+ employed",
  family_unemployed      = "Family,\nno paid work",
  retiree                = "Retiree",
  working_age_employed   = "Working age,\nemployed",
  working_age_unemployed = "Working age,\nno paid work"
))

# Seven household types again, with line breaks inside each label
household_type_labels_spaced <- as_labeller(c(
  family_employed_1      = "Family,\n1\nincome",
  family_employed_2      = "Family,\n2+\nincomes",
  family_unemployed      = "Family,\nno\npaid work",
  retiree_other          = "Retiree,\nno\npension",
  retiree_pension        = "Retiree,\nreceiving\npension",
  working_age_employed   = "Working\nage,\nemployed",
  working_age_unemployed = "Working\nage,\nno\npaid work"
))


# Labels for all age groups
age_group_labels <- as_labeller(c(
  "under15s" = "Under 15",
  "15to24"   = "15 to 24",
  "25to34"   = "25 to 34",
  "35to44"   = "35 to 44",
  "45to54"   = "45 to 54",
  "55to64"   = "55 to 64",
  "65plus"   = "65+"
))

# Labels for the age groups without the under 15s
HESP_age_group_labels <- as_labeller(c(
  "15to24" = "15 to 24",
  "25to34" = "25 to 34",
  "35to44" = "35 to 44",
  "45to54" = "45 to 54",
  "55to64" = "55 to 64",
  "65plus" = "65+"
))


# Replace every whitespace character in x with a newline, so that each word of
# a label sits on its own line. Arguments passed through ... are ignored.
addline_format <- function(x, ...) {
  gsub(pattern = '\\s', replacement = '\n', x = x)
}

# Income component labels with every space turned into a line break
spaced_income_type_labels <- as_labeller(
  addline_format(c(
    eq_lab_inc   = "Labour income",
    eq_cap_inc   = "Capital income",
    eq_trans_inc = "Transfer income",
    eq_inc_tax   = "Income tax"
  ))
)


# Labels for the poverty measures
poverty_labels <- as_labeller(c(
  "income"                         = "Income",
  "consumption"                    = "Final consumption",
  "financial_Headey_liquid_assets" = "Financial",
  "cons_no_in_kind"                = "Private consumption"
))




########################################
# HES years - the 3 years in which HES overlaps with HILDA
HES_years <-  c(2004, 2010, 2016)



#################
# Cleaner variable names for age, household id, household population weight,
# responding person weight, year of birth, and whether a responding person
dat <- dat %>%
  mutate(
    age        = hgage,                   # age
    hh_id      = hhrhid,                  # household id
    hh_wt      = hhwth,                   # household/enumerated person weight
    hhwt       = hhwth,                   # same weight, named to match HES
    rp_wt      = hhwtrp,                  # responding person weight
    birth_year = hgyob,                   # year of birth
    rp         = as.integer(hgint) - 11   # responding person (1 = yes)
  )



#####################################################
# EQUIVALENCE SCALES

# Build the equivalence scales, then drop the age-specific child counts that
# were only needed to compute the number of children.
dat <- dat %>%
  mutate(
    # number of children = sum of the three age-based child counts
    hhchild = hh0_4 + hh5_9 + hh10_14,
    # OECD-modified scale: 1 for the first adult, 0.5 for each further adult,
    # 0.3 for each child
    OECD_mod = 1 + 0.5 * (hhadult - 1) + 0.3 * hhchild,
    # square root scale: square root of household size
    square_root_eq = sqrt(hhadult + hhchild)
  ) %>%
  select(-hh0_4, -hh5_9, -hh10_14)

#####################################################
# INCOME VARIABLES
# regular HILDA income variables are used (not total income, which include irregular income) to match with the ABS, and because more consistent prior to 2012
# see Appendix in Wilkins (2014) 'Derived income variables in the HILDA survey' for explanation, and http://www.abs.gov.au/ausstats/abs@.nsf/Lookup/by%20Subject/6503.0~2015-16~Main%20Features~Income~2

# We used the restricted (non-top-coded) version of HILDA - just for household disposable income
# The restricted household disposable income values are read from a separate file (see the readRDS() call below) rather than being extracted with PanelWhiz. TODO: record the name of the script that creates that file.
# But our code will also work using the general (non-restricted) release.

##################################################
# DISPOSABLE INCOME (general release)
# generate real household disposable regular income variables (adjusted for inflation to 2016-17 dollars) - using top coded values.
# dat <-  mutate(dat, hifdip_rl = hifdip*deflator)
# dat <-  mutate(dat, hifdin_rl = hifdin*deflator)
#################################################

################################################# (skip this bit if using general release)
# DISPOSABLE INCOME (restricted release)
# Load the restricted-release household disposable income file
# (not top coded or bottom coded)
HILDA_inc_res <- readRDS("H:\\Longitudinal Surveys\\Hilda R16 restricted\\hh_disp_inc_res.rds")

# Join it to the main data file by person and year, then generate the real
# household disposable regular income variables (adjusted for inflation to
# 2016-17 dollars) from the restricted-release values.
dat <- dat %>%
  left_join(HILDA_inc_res, by = c("xwaveid", "year")) %>%
  mutate(
    hifdip_rl = hifdip_res * deflator,
    hifdin_rl = hifdin_res * deflator
  )
###################################################

# From this point either the general release or restricted release disposable income variables will work the same.

# Disposable income: household total, OECD-equivalised versions, and a
# square-root-equivalised version used for calculating ginis
dat <- dat %>%
  mutate(
    # household disposable regular income (positive minus negative)
    hh_disp_inc = hifdip_rl - hifdin_rl,
    # equivalised with the OECD-modified scale: net, positive and negative parts
    eq_disp_inc  = hh_disp_inc / OECD_mod,
    eq_disp_incp = hifdip_rl / OECD_mod,
    eq_disp_incn = hifdin_rl / OECD_mod,
    # equivalised with the SQUARE ROOT scale
    sqrt_eq_disp_inc = hh_disp_inc / square_root_eq
  )

# GROSS
# Household gross regular income: deflate to 2016-17 dollars, net the positive
# and negative parts, then equivalise.
dat <- dat %>%
  mutate(
    hifefp_rl = hifefp * deflator,
    hifefn_rl = hifefn * deflator,
    # household gross regular income (positive minus negative)
    hh_gross_inc = hifefp_rl - hifefn_rl,
    # equivalised net, positive and negative parts
    eq_gross_inc  = hh_gross_inc / OECD_mod,
    eq_gross_incp = hifefp_rl / OECD_mod,
    eq_gross_incn = hifefn_rl / OECD_mod
  )

# MARKET (PRIVATE)
# Uses household regular private income (which includes private transfers) to
# match standard ABS definitions and Greenville 2013 (I think - Josh to confirm).
# Deflate to 2016-17 dollars, net the positive and negative parts, then equivalise.
dat <- dat %>%
  mutate(
    hifpiip_rl = hifpiip * deflator,
    hifpiin_rl = hifpiin * deflator,
    # household private regular income (positive minus negative)
    hh_priv_inc = hifpiip_rl - hifpiin_rl,
    # equivalised net, positive and negative parts
    eq_priv_inc  = hh_priv_inc / OECD_mod,
    eq_priv_incp = hifpiip_rl / OECD_mod,
    eq_priv_incn = hifpiin_rl / OECD_mod
  )

# LABOUR, CAPITAL, TRANSFERS AND TAX
dat <- dat %>%
  mutate(
    # inflation adjusted labour income (no negative part for labour income)
    hh_lab_inc = hiwsfei * deflator,
    eq_lab_inc = hh_lab_inc / OECD_mod,

    # disposable income broken into its four constituent parts
    # (labour, other private, govt transfers, tax)
    eq_other_priv_inc = eq_priv_inc - eq_lab_inc,
    eq_govt_trans     = eq_gross_inc - eq_priv_inc,
    eq_tax            = eq_disp_inc - eq_gross_inc,

    # capital and transfer income: second names for two of the parts above
    eq_cap_inc   = eq_other_priv_inc,   # private income minus labour income
    eq_trans_inc = eq_govt_trans,       # gross income minus private income

    # additional income variables
    eq_bus_inc       = ((hibifip - hibifin) * deflator) / OECD_mod,
    eq_invest_inc    = ((hifinip - hifinin) * deflator) / OECD_mod,
    eq_priv_pensions = (hifppi * deflator) / OECD_mod,
    eq_priv_trans    = (hifpti * deflator) / OECD_mod
  )


###############################################################
# WEALTH VARIABLES
################################################################

# WEALTH YEARS - 4 years that wealth module is included in HILDA
wealth_years <- c(2002, 2006, 2010, 2014)

# NET WORTH
# Adjust for inflation, equivalise and subtract overdue household bills.
#
# Net worth after bills is computed as a single signed value first and only
# then split into its positive and negative parts. Splitting first (as this
# code used to do) goes wrong when a household's positive net worth is no
# larger than its overdue bills: the positive part was floored at 0 and the
# whole bill was booked as negative wealth, so the household's own net worth
# was dropped from the result.
dat <- dat %>%
  mutate(
    # overdue household bills - set to 0 for 2002 (not collected in that year)
    eq_overdue_bills = ifelse(year == 2002, 0, (hwobdti * deflator_lag) / OECD_mod),

    # signed net worth after bills, held in eq_wealthp until it is split below
    eq_wealthp = ((hwnwip - hwnwin) * deflator_lag) / OECD_mod - eq_overdue_bills,

    # negative part (stored as a positive number), 0 if net worth is positive
    eq_wealthn = ifelse(eq_wealthp < 0, -eq_wealthp, 0),
    # positive part, 0 if net worth is negative
    eq_wealthp = ifelse(eq_wealthp < 0, 0, eq_wealthp),

    # summed household wealth (positive minus negative)
    eq_wealth = eq_wealthp - eq_wealthn
  )


# Unequivalised wealth, and equivalised REAL investment variables
dat <- dat %>%
  mutate(
    # undo the equivalisation of net wealth
    uneq_wealth = eq_wealth * OECD_mod,

    # equity investments plus cash investments (not cash holdings)
    eq_equity_and_cash = ((hwcaini + hweqini) * deflator_lag) / OECD_mod,
    eq_trust_funds     = (hwtrusi * deflator_lag) / OECD_mod,
    # total investments
    eq_investments = eq_equity_and_cash + eq_trust_funds
  )

# Property and business: asset value, debt and equity (asset minus debt)
dat <- dat %>%
  mutate(
    # home
    eq_home        = (hwhmvai * deflator_lag) / OECD_mod,
    eq_home_debt   = (hwhmdti * deflator_lag) / OECD_mod,
    eq_home_equity = eq_home - eq_home_debt,

    # other property
    eq_other_property        = (hwopvai * deflator_lag) / OECD_mod,
    eq_other_property_debt   = (hwopdti * deflator_lag) / OECD_mod,
    eq_other_property_equity = eq_other_property - eq_other_property_debt,

    # business equity = business assets minus business debt
    eq_business_equity = ((hwbusvi - hwbusdi) * deflator_lag) / OECD_mod
  )

# Remaining assets (positive) and personal debts (stored as NEGATIVE numbers)
dat <- dat %>%
  mutate(
    # assets
    eq_bank           = (hwtbani * deflator_lag) / OECD_mod,
    eq_super          = (hwsupei * deflator_lag) / OECD_mod,
    eq_life_insurance = (hwinsui * deflator_lag) / OECD_mod,
    eq_collect        = (hwcolli * deflator_lag) / OECD_mod,
    eq_vehicles       = (hwvechi * deflator_lag) / OECD_mod,

    # debts, sign flipped so that they can be added to assets
    eq_credit_card         = -(hwccdti * deflator_lag) / OECD_mod,
    eq_HECS                = -(hwhecdi * deflator_lag) / OECD_mod,
    eq_other_personal_debt = -(hwothdi * deflator_lag) / OECD_mod
  )

# 7 WEALTH categories
# Five already exist: eq_home_equity, eq_super, eq_other_property_equity,
# eq_business_equity and eq_vehicles. The last two are built here, along with
# "other" wealth and the unequivalised versions of the main variables.
dat <- dat %>%
  mutate(
    # the personal debt terms are stored as negatives, so adding subtracts them
    eq_financial_wealth = eq_investments + eq_bank + eq_life_insurance + eq_other_personal_debt,
    eq_personal_wealth  = eq_collect + eq_credit_card + eq_HECS,

    # other wealth: everything that is neither super nor home equity
    eq_other_wealth = eq_wealth - eq_super - eq_home_equity,

    # real, UNequivalised wealth variables
    hh_home_equity = eq_home_equity * OECD_mod,
    hh_super       = eq_super * OECD_mod,
    hh_wealth      = eq_wealth * OECD_mod
  )


dat <- dat %>%
  mutate(
    # Total debt as a positive number (not including business debt, which is
    # netted as business equity).
    # eq_home_debt and eq_other_property_debt are stored as positive amounts,
    # while eq_HECS, eq_credit_card and eq_other_personal_debt are stored as
    # negatives, so the personal debts are subtracted here to add their size.
    # Adding all five would net personal debt off against property debt.
    eq_debt = eq_home_debt + eq_other_property_debt -
      eq_HECS - eq_credit_card - eq_other_personal_debt,

    # liquid assets for Headey poverty test: super is only counted when the
    # oldest person in the house (hhold) is 65 or over
    eq_liquid_assets = eq_equity_and_cash + eq_bank + ifelse(hhold >= 65, eq_super, 0)
  )


###################################################################################
# DEMOGRAPHIC CATEGORIES

######################
# dependent children

# total number of dependent children recorded on each row
dat <- mutate(dat, depend_child = hhd0_4 + hhd5_9 + hhd1014 + hhd1524)

# One row per household and year, carrying the largest depend_child value in
# that household: sort descending within household, then keep the first row.
dat_depend_child <- dat %>%
  select(year, hh_id, hh_depend_child = depend_child) %>%
  arrange(year, hh_id, desc(hh_depend_child)) %>%
  distinct(year, hh_id, .keep_all = TRUE)   # .keep_all keeps hh_depend_child

# add hh_depend_child to the main dataframe
dat <- dat %>%
  left_join(dat_depend_child, by = c("year", "hh_id"))

rm(dat_depend_child)

################################
# Check whether everyone in the house is 65 or over: sort by age within each
# household and year, keep the first (youngest) person, and test that person.
dat_all_65_plus <- dat %>%
  select(year, hh_id, xwaveid, age) %>%
  arrange(year, hh_id, age) %>%
  distinct(year, hh_id, .keep_all = TRUE) %>%   # youngest person in the house
  transmute(year, hh_id, all_65_plus = ifelse(age >= 65, yes = 1, no = 0))

# add the all_65_plus flag to the main dataframe
dat <- dat %>%
  left_join(dat_all_65_plus, by = c("year", "hh_id"))

rm(dat_all_65_plus)

################################
# number of employed people in the household - NOTE that quite a few adults in
# household are non-respondents, treated as not working.
dat <- dat %>%
  mutate(
    lab_status     = esbrd,
    lab_status_int = as.integer(lab_status) - 10,   # as integer 1,2,3 read as 11,12,13
    employed       = ifelse(lab_status_int == 1, yes = 1, no = 0)
  )

# Count employed people per household and year. na.rm = TRUE makes a missing
# labour force status count as not working, as described above; without it a
# single missing status would turn the whole household count into NA.
dat_employed <- dat %>%
  select(year, hh_id, employed) %>%
  group_by(year, hh_id) %>%
  summarise(hh_employed = sum(employed, na.rm = TRUE)) %>%
  ungroup()   # do not carry the leftover grouping into the join

dat <- left_join(dat, dat_employed, by = c("year", "hh_id"))

rm(dat_employed)

#################################
# Does anyone in the household receive benefits?
# hicapi: imputed current weekly Australian public transfers for household
dat <- mutate(dat, any_benefit = ifelse(hicapi > 0, yes = 1, no = 0))

# One row per household and year: sort descending so that a row with a benefit
# (if there is one) comes first, then keep only that first row.
dat_benefit <- dat %>%
  select(year, hh_id, hh_benefit = any_benefit) %>%
  arrange(year, hh_id, desc(hh_benefit)) %>%
  distinct(year, hh_id, .keep_all = TRUE)

dat <- dat %>%
  left_join(dat_benefit, by = c("year", "hh_id"))

rm(dat_benefit)


###################################
# Put it all together to categorise each person according to their family type.
#
# eq_lab_inc == 0 is part of the "not employed" criteria to tighten them up and
# account for non-responding adults. Rules are tried in order and the first
# match wins; rows matching no rule (including rows where a needed input is
# missing) are coded -99.
dat <- dat %>%
  mutate(household_type = case_when(
    hh_depend_child > 0 & (hh_employed == 1 | (hh_employed == 0 & eq_lab_inc > 0)) ~ 1, # family employed 1
    hh_depend_child > 0 & hh_employed >= 2 ~ 2, # family employed 2 or more
    hh_depend_child > 0 & hh_employed == 0 & eq_lab_inc == 0 ~ 3, # family not employed
    hh_depend_child == 0 & all_65_plus > 0 & hh_employed == 0 & eq_lab_inc == 0 & any_benefit == 0 ~ 4, # retiree self-funded
    hh_depend_child == 0 & all_65_plus > 0 & hh_employed == 0 & eq_lab_inc == 0 & any_benefit > 0 ~ 5, # pensioners
    hh_depend_child == 0 & (hh_employed >= 1 | (hh_employed == 0 & eq_lab_inc > 0)) ~ 6, # working age employed
    hh_depend_child == 0 & all_65_plus == 0 & hh_employed == 0 & eq_lab_inc == 0 ~ 7, # working age not employed
    TRUE ~ -99))

# Convert the codes to a labelled factor. The levels are given explicitly so
# that code k always gets the k-th label, even if some category is absent from
# the data, and so that the -99 "no match" code becomes NA instead of raising
# an error for having more levels than labels.
dat <- dat %>%
  mutate(household_type = factor(
    household_type,
    levels = 1:7,
    labels = c("family_employed_1", "family_employed_2", "family_unemployed",
               "retiree_other", "retiree_pension",
               "working_age_employed", "working_age_unemployed")))


# Household types as strings.
# The names are the same as the household_type factor labels and the keys of
# the household type labellers defined in this file. The third and seventh
# categories used to be spelled "family_not_employed" and
# "working age unemployed", which matched neither.
# NOTE(review): any code elsewhere that filters on the old spellings needs the
# same change.
dat <- dat %>%
  mutate(household_type_string = case_when(
    hh_depend_child > 0 & (hh_employed == 1 | (hh_employed == 0 & eq_lab_inc > 0)) ~ "family_employed_1",
    hh_depend_child > 0 & hh_employed >= 2 ~ "family_employed_2",
    hh_depend_child > 0 & hh_employed == 0 & eq_lab_inc == 0 ~ "family_unemployed",
    hh_depend_child == 0 & all_65_plus > 0 & hh_employed == 0 & eq_lab_inc == 0 & any_benefit == 0 ~ "retiree_other",
    hh_depend_child == 0 & all_65_plus > 0 & hh_employed == 0 & eq_lab_inc == 0 & any_benefit > 0 ~ "retiree_pension",
    hh_depend_child == 0 & (hh_employed >= 1 | (hh_employed == 0 & eq_lab_inc > 0)) ~ "working_age_employed",
    hh_depend_child == 0 & all_65_plus == 0 & hh_employed == 0 & eq_lab_inc == 0 ~ "working_age_unemployed",
    TRUE ~ NA_character_))

# Collapsed household types for poverty: the two employed-family types become
# "family_employed" and the two retiree types become "retiree"; every other
# value (including NA) is carried over unchanged.
dat <- dat %>%
  mutate(household_type_poverty = case_when(
    household_type_string %in% c("family_employed_1", "family_employed_2") ~ "family_employed",
    household_type_string %in% c("retiree_pension", "retiree_other") ~ "retiree",
    TRUE ~ household_type_string))




##############################################################
# AGE RANGES
#################################################################



# Create age group variable as a labelled factor.
# cut() with right = FALSE builds the intervals [-Inf, 15), [15, 25), ...,
# [65, Inf), i.e. the same bands as before. All seven levels are always
# present and in the same order, and a missing age becomes NA rather than a
# -99 code that would break the labelling.
dat <- dat %>%
  mutate(age_group = cut(
    hgage,
    breaks = c(-Inf, 15, 25, 35, 45, 55, 65, Inf),
    labels = c("under 15", "15 to 24", "25 to 34", "35 to 44",
               "45 to 54", "55 to 64", "65 plus"),
    right = FALSE))



########################################################################################################################

# One 0/1 indicator column per age group, then the age group as a string
dat <- dat %>%
  mutate(
    under15s = as.numeric(hgage < 15),
    `15to24` = as.numeric(hgage >= 15 & hgage < 25),
    `25to34` = as.numeric(hgage >= 25 & hgage < 35),
    `35to44` = as.numeric(hgage >= 35 & hgage < 45),
    `45to54` = as.numeric(hgage >= 45 & hgage < 55),
    `55to64` = as.numeric(hgage >= 55 & hgage < 65),
    `65plus` = as.numeric(hgage >= 65),

    # readable age group, taken from whichever indicator is switched on
    age_group_string = case_when(
      hgage < 15    ~ "Under 15",
      `15to24` == 1 ~ "15 to 24",
      `25to34` == 1 ~ "25 to 34",
      `35to44` == 1 ~ "35 to 44",
      `45to54` == 1 ~ "45 to 54",
      `55to64` == 1 ~ "55 to 64",
      `65plus` == 1 ~ "65+"
    )
  )

########################################################
# AGE COHORT VARIABLEs

# Create birth decade variable ("1910s" to "1990s") for birth years from 1910
# to 1999; any other birth year, or a missing one, gets NA.
dat <- dat %>%
  mutate(birth_decade = case_when(
    birth_year > 1909 & birth_year < 2000 ~ paste0(floor(birth_year / 10) * 10, "s"),
    TRUE ~ NA_character_
  ))


################################################################
# CONSUMPTION VARIABLES
###############################################################

# CONSUMPTION YEARS - years that have good consumption data
cons_years <- as.numeric(2006:2016)

#############
# RENT
dat <- dat %>%
  mutate(
    hh_act_rent = 12 * hsrnti,     # annualised rent
    hh_exp_rent = 52 * hsfa,       # annualised expected rent for those living rent free
    hh_imp_rent = 0.05 * hsvalui,  # imputed rent: 5% of house value, for those that own/are paying off their home

    # One household rent variable. (I have checked that only rent-buy people -
    # see variable hstenr, hstenur for wave 1 - are positive in more than one
    # of the three measures.) The first matching rule wins.
    hh_rent = case_when(
      # renters, excluding those in rent to buy schemes. Household repairs and
      # maintenance have NOT been added to rent paid
      hh_act_rent >= 0 & hh_imp_rent <= 0 ~ hh_act_rent,
      # home owners, those paying off a mortgage and those in rent to buy
      # schemes (imputed rent is preferred to actual rent, which may be
      # contributing to capital)
      hh_imp_rent > 0 ~ hh_imp_rent,
      # those living rent free: expected rent
      hh_exp_rent >= 0 & hh_imp_rent <= 0 ~ hh_exp_rent,
      # rent is set to -1 for those with missing values - 102 people in 2016
      TRUE ~ -1
    )
  )


######################################################################################
# Build up consumption variables for waves 6-16 (consumption data is patchy in
# earlier years) - using all imputed variables.

# make years integer - possibly needed for case_when's to work
# dat <- mutate(dat, year=as.integer(year))

dat <- dat %>%
  mutate(
    # 'food' type expenditure:
    # groceries + alcohol + cigarettes and tobacco + meals eaten out
    hh_food = hxygrci + hxyalci + hxycigi + hxymli,

    # non-durable transport expenditure: public transport and taxis
    # + motor vehicle fuel + motor vehicle repairs and maintenance
    hh_transport = hxypbti + hxymvfi + hxymvri,

    # clothing expenditure: men's + women's + children's clothing and footwear
    # (0 before 2006)
    hh_clothing = case_when(
      year >= 2006 ~ hxymcfi + hxywcfi + hxyccfi,
      TRUE ~ 0L
    ),

    # health expenditure: fees paid to health practitioner + medicines,
    # prescriptions and pharmaceuticals + private health insurance (included
    # following ABS); 0 before 2006
    hh_health = case_when(
      year >= 2006 ~ hxyhlpi + hxyphmi + hxyphii,
      TRUE ~ 0L
    ),

    # utilities expenditure: telephone rent and calls, internet charges
    # + electricity, gas bills and other heating fuel; 0 before 2006
    hh_utilities = case_when(
      year >= 2006 ~ hxytlii + hxyutli,
      TRUE ~ 0L
    )
  )



###############################
# Equivalised REAL consumption variables, and the headline consumption figure
dat <- dat %>%
  mutate(
    # each component is deflated with deflator_lag and divided by OECD_mod
    eq_rent            = (hh_rent * deflator_lag) / OECD_mod,
    eq_food            = (hh_food * deflator_lag) / OECD_mod,
    eq_transport       = (hh_transport * deflator_lag) / OECD_mod,
    eq_clothing        = (hh_clothing * deflator_lag) / OECD_mod,
    eq_health          = (hh_health * deflator_lag) / OECD_mod,
    eq_other_insurance = (hxyoii * deflator_lag) / OECD_mod,
    eq_utilities       = (hh_utilities * deflator_lag) / OECD_mod,
    eq_education       = (hxyedci * deflator_lag) / OECD_mod,
    eq_childcare       = (ccactci * deflator_lag) / OECD_mod,

    # Headline consumption figure. Home repairs (hxyhmri) are treated as
    # investment rather than consumption (rent or gross imputed rent is counted
    # within consumption instead). Other insurance is included because,
    # according to HES, most insurance is vehicle insurance and other stuff
    # (not home and contents, which we have excluded in HES).
    eq_cons = eq_rent + eq_food + eq_transport + eq_clothing + eq_health +
      eq_other_insurance + eq_utilities + eq_education + eq_childcare
  )