Coverage for censusdis/multiyear.py: 74%

86 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2025-04-03 05:39 +0000

1"""Utility functions for downloading, graphing and analyzing multiple years of ACS data.""" 

2 

3from collections import defaultdict 

4import pandas as pd 

5 

6import matplotlib.pyplot as plt 

7from matplotlib.ticker import FuncFormatter 

8 

9import censusdis.data as ced 

10from censusdis.datasets import ACS1, ACS3, ACS5 

11 

12import re 

13 

14from typing import List, Optional, Union, Iterable, Callable, Dict 

15 

16 

def is_variable_column(
    col: str,
    download_variables: Optional[Union[str, Iterable[str]]],
    group: Optional[str],
) -> bool:
    """
    Determine whether a column is a Census variable.

    There are two ways to download data from the Census Bureau: as a group (i.e. table) or
    as a list of variables. The way to test whether a column of the resulting
    df is a variable depends on whether the data was requested by group or variable.

    In practice this function is used to drop columns that only contain metadata.
    We want columns like "B01001_001E" and not like "STATE".

    Parameters
    ----------
    col
        Name of the dataframe column to test.
    download_variables
        The variable (a single string) or variables (an iterable of strings) that
        were requested, or None if the data was requested by group.
    group
        The group (table) that was requested, or None if the data was requested
        by variable.

    Returns
    -------
    True if `col` starts with `group`, or if `col` is one of `download_variables`.
    False otherwise.
    """
    if group and col.startswith(group):
        return True

    if not download_variables:
        return False

    # A bare string names exactly one variable. Using `in` on a string performs
    # a substring search, so "B01" would wrongly match "B01001_001E".
    if isinstance(download_variables, str):
        return col == download_variables

    return col in download_variables

38 

39 

# This function is based on the notebook linked to in this github issue:
# https://github.com/censusdis/censusdis/issues/325
def name_mapper(
    dataset: str,
    vintage: int,
    download_variables: Optional[Union[str, Iterable[str]]],
    group: Optional[str],
) -> Callable[[str], str]:
    """
    Return a function that converts an ACS variable to its label.

    Parameters
    ----------
    dataset
        The dataset whose variable metadata is searched.
    vintage
        The year whose labels are used.
    download_variables
        The variables that were requested, or None if the data was requested by group.
    group
        The group that was requested, or None if the data was requested by variable.

    Returns
    -------
    A callable suitable for `pd.DataFrame.rename(columns=...)`. Columns that are
    not Census variables are returned unchanged.
    """

    def inner(variable: str) -> str:
        """Map from the variables we got back to their labels."""
        if not is_variable_column(variable, download_variables, group):
            # Not in the group we are interested in, so leave it as is.
            return variable

        # Look up details of the particular variable:
        matches = ced.variables.search(
            dataset, vintage, group_name=group, name=variable
        )

        # Census uses !! to indicate nesting of Labels. Ex. 'Estimate!!Total:'
        # We care about the last part.
        label = matches.iloc[0]["LABEL"].split("!!")[-1]

        # Starting in 2020 Labels which are parents of other Labels have a : as a suffix.
        # See an example here: https://data.census.gov/table?q=country%20of%20birth&g=9700000US3612510
        # (Ex. "Total:", "Asia:", "Eastern Asia:", "China:")
        # The trailing : is dropped. `endswith` is used rather than indexing
        # `label[-1]` so that an empty label does not raise an IndexError.
        return label[:-1] if label.endswith(":") else label

    return inner

72 

73 

def get_unique_labels_for_variable(
    acs: str, variable: str, years: List[int]
) -> Dict[str, List[int]]:
    """
    Collect every label the ACS has attached to a variable over a set of years.

    The ACS occasionally relabels a variable. Some changes are cosmetic, while
    others reuse the same variable for something entirely different. For example,
    B08006_017E was 'Estimate!!Total!!Motorcycle' in 2005,
    'Estimate!!Total!!Worked at home' in 2006 and
    'Estimate!!Total:!!Worked from home' in 2019.

    To cut down on false positives, each label is lower-cased before comparison
    ("Race" is inconsistently capitalized over the years) and every ":" is removed,
    so "estimate!!total:!!native" and "estimate!!total!!native" compare equal.

    Parameters
    ----------
    acs
        The ACS to use. Ex. censusdis.datasets.ACS1
    variable
        The variable in question. Ex. 'B01001_001E'
    years
        The years to check. Ex. [2005, 2006, 2007]

    Returns
    -------
    A dict mapping each normalized label to the list of years in which it was used.
    A result of length 1 means the variable only ever had one label.
    """
    years_by_label = defaultdict(list)

    for vintage in years:
        metadata = ced.variables.get(acs, vintage, variable)
        normalized = metadata["label"].lower().replace(":", "")
        years_by_label[normalized].append(vintage)

    return years_by_label

112 

113 

class VariableMistmatchOverTimeError(Exception):
    """
    Signal that an ACS variable carried more than one label across the years.

    The spelling "Mistmatch" is part of the public name, so it is kept as is.
    """

118 

119 

def warn_variable_changes(
    df: pd.DataFrame,
    dataset: str,
    vintages: List[int],
    download_variables: Optional[Union[str, Iterable[str]]],
    group: Optional[str],
    prompt: bool,
) -> None:
    """
    Issue a warning when an ACS variable has had multiple labels over the years.

    In the ACS, sometimes the same variable is used for different things in different years.
    For example in 2005 `B08006_017E` was used for "Estimate!!Total!!Motorcycle". But in 2006 it
    changed to "Estimate!!Total!!Worked at home" and in 2019 it changed to "Estimate!!Total:!!Worked from home".
    This code alerts users of any variables which have had different labels over time.

    This function always prints a warning if it encounters that situation. If prompt is True
    then it also prompts the user to confirm whether they want to continue with the download.

    Parameters
    ----------
    df
        The downloaded data. Only its column names are inspected.
    dataset
        The dataset the columns were downloaded from.
    vintages
        The years over which labels are compared.
    download_variables
        The variables that were requested, or None if the data was requested by group.
    group
        The group that was requested, or None if the data was requested by variable.
    prompt
        If True, ask the user whether to continue after each warning.

    Raises
    ------
    VariableMistmatchOverTimeError
        If `prompt` is True and the user answers anything other than "y"
        (case and surrounding whitespace are ignored).
    """
    for col in df.columns:
        if not is_variable_column(col, download_variables, group):
            continue

        labels_to_years = get_unique_labels_for_variable(dataset, col, vintages)

        if len(labels_to_years) <= 1:
            continue

        print(f"Warning: {col} has had multiple labels over the selected years:")
        for label, label_years in labels_to_years.items():
            print(f"\t'{label}' in {label_years}")

        if prompt:
            # Accept "Y" and stray whitespace so an affirmative answer is not
            # mistaken for a refusal.
            answer = input("Continue downloading dataset (y/n)?")
            if answer.strip().lower() != "y":
                raise VariableMistmatchOverTimeError()

156 

157 

def download_multiyear(
    dataset: str,
    vintages: List[int],
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    *,
    group: Optional[str] = None,
    rename_vars: bool = True,
    drop_cols: bool = True,
    prompt: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Download multiple years of ACS data into a single dataframe.

    Variables can be specified individually by `download_variables` or as a table by `group`.

    Parameters
    ----------
    dataset
        Must be one of `censusdis.datasets.ACS1`, `censusdis.datasets.ACS3` or `censusdis.datasets.ACS5`.
    vintages
        A non-empty list of years to download data for.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    group
        The group (as defined by the U.S. Census for the data set)
        whose variable values should be downloaded.
    rename_vars
        If True, rename the columns from variables (ex. "B01001_001E") to their labels (ex. "Total").
        The labels for the last year are used.
    drop_cols
        If True, drop cols that do not contain variables or the year (ex. geography columns).
    prompt
        This function emits a warning each time a downloaded variable has had multiple labels over time.
        If True, prompt the user whether they want to continue downloading the dataset despite the differences.
    **kwargs
        Geography parameters passed directly to `ced.download`.

    Returns
    -------
    A dataframe with one block of rows per vintage and a "Year" column
    identifying the vintage each row came from.

    Raises
    ------
    ValueError
        If `dataset` is not one of the supported ACS datasets, if not exactly one
        of `download_variables` and `group` is set, or if `vintages` is empty.

    Examples
    --------
    # Download the population by nativity in Great Neck School District, NY.
    from censusdis.multiyear import download_multiyear
    from censusdis.datasets import ACS5
    from censusdis.states import NY

    # Download the entire table.
    df = download_multiyear(
        dataset=ACS5,
        vintages=[2009, 2014, 2019],
        group="B05012",
        prompt=False,
        state=NY,
        school_district_unified="12510",
    )

    # Download selected variables.
    df = download_multiyear(
        dataset=ACS5,
        vintages=[2009, 2014, 2019],
        download_variables=["B05006_049E", "B05006_060E", "B05006_054E"],
        state=NY,
        school_district_unified="12510",
        drop_cols=True,
    )
    """
    if dataset not in [ACS1, ACS3, ACS5]:
        raise ValueError(
            "Dataset must be one of `censusdis.datasets.ACS1`, `censusdis.datasets.ACS3` or `censusdis.datasets.ACS5`"
        )

    if (download_variables is None and group is None) or (
        download_variables is not None and group is not None
    ):
        raise ValueError("Exactly one of download_variables and group must be set.")

    # Without at least one vintage there is nothing to download and no
    # "last year" to take labels from. `len` is used rather than truthiness so
    # that array-like inputs are handled too.
    if len(vintages) == 0:
        raise ValueError("vintages must contain at least one year.")

    frames = []

    for vintage in vintages:
        # This loop can take a while, so provide feedback to the user
        print(".", end="", flush=True)

        df_new = ced.download(
            dataset=dataset,
            vintage=vintage,
            download_variables=download_variables,
            group=group,
            **kwargs,
        )

        df_new["Year"] = vintage
        frames.append(df_new)

    # Concatenate once at the end instead of re-copying the accumulated frame
    # on every iteration.
    df = frames[0] if len(frames) == 1 else pd.concat(frames)

    warn_variable_changes(df, dataset, vintages, download_variables, group, prompt)

    if drop_cols:
        df = df[
            [
                col
                for col in df.columns
                if is_variable_column(col, download_variables, group) or col == "Year"
            ]
        ]

    if rename_vars:
        df = df.rename(
            columns=name_mapper(
                dataset=dataset,
                vintage=vintages[-1],
                download_variables=download_variables,
                group=group,
            )
        )

    df = df.reset_index(drop=True)
    return df

281 

282 

def graph_multiyear(
    df: pd.DataFrame,
    title: str = "",
    yaxis_title: str = "",
    y_cols: Optional[Iterable[str]] = None,
) -> None:
    """
    Create a (multi-line) graph of time series data.

    Parameters
    ----------
    df
        Must have a column called 'Year' which will serve as the x-axis.
    title
        Title for the graph.
    yaxis_title
        Title for the y-axis.
    y_cols
        A list of columns in `df` to create lines for. If None then will graph all
        columns except "Year". When there are more columns than palette colors,
        the colors repeat from the start of the palette.

    Returns
    -------
    NoneType

    Examples
    --------
    # Graph the population by nativity in Great Neck School District, NY.
    from censusdis.multiyear import download_multiyear, graph_multiyear
    from censusdis.datasets import ACS5
    from censusdis.states import NY

    df = download_multiyear(
        dataset=ACS5,
        vintages=[2010, 2015, 2020],
        group="B05012",
        state=NY,
        school_district_unified="12510",
    )
    graph_multiyear(
        df,
        "Population by Nativity in Great Neck School District",
        "Population",
        ["Total", "Native", "Foreign-Born"],
    )
    """

    # Define a function to format the y-axis with commas
    def format_yaxis(value, tick_position):
        return f"{value:,.0f}"

    if not y_cols:
        y_cols = [col for col in df.columns if col != "Year"]

    colorblind_palette = [
        "#E69F00",
        "#56B4E9",
        "#009E73",
        "#F0E442",
        "#0072B2",
        "#D55E00",
        "#CC79A7",
        "#999999",
        "#E41A1C",
    ]

    for idx, y_col in enumerate(y_cols):
        plt.plot(
            df["Year"],
            df[y_col],
            label=y_col,
            marker="o",
            # Wrap around so that plotting more columns than there are palette
            # entries reuses colors instead of raising an IndexError.
            color=colorblind_palette[idx % len(colorblind_palette)],
        )

    # Explicitly set the x-axis ticks to match the actual years in the data
    plt.xticks(ticks=df["Year"], labels=df["Year"], rotation=-45)

    # Apply comma formatting to the y-axis
    plt.gca().yaxis.set_major_formatter(FuncFormatter(format_yaxis))

    plt.xlabel("Year")
    plt.ylabel(yaxis_title)
    plt.title(title)
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()

369 

370 

def pct_change_multiyear(df: pd.DataFrame) -> pd.DataFrame:
    r"""
    Turn a multi-year dataframe of raw counts into row-over-row percent changes.

    `pd.DataFrame.pct_change` is applied to the dataframe, the result is scaled
    to percent and rounded to 1 decimal point, and the original "Year" values
    are then put back so the "Year" column is left as it was.

    Parameters
    ----------
    df
        Must have a column called 'Year'.

    Returns
    -------
    A new Dataframe; the input is not modified.

    Examples
    --------
    # Graph the percent change in the population by nativity in Great Neck School District, NY.
    from censusdis.multiyear import (
        download_multiyear,
        pct_change_multiyear,
        graph_multiyear,
    )
    from censusdis.datasets import ACS5
    from censusdis.states import NY

    df = download_multiyear(
        dataset=ACS5,
        vintages=[2010, 2015, 2020],
        group="B05012",
        prompt=False,
        state=NY,
        school_district_unified="12510",
    )

    df = pct_change_multiyear(df)
    print(df)

    graph_multiyear(
        df,
        "Percent Change in Population by Nativity\nGreat Neck School District, NY",
        "Percent Change",
    )
    """
    # Grab the years first: pct_change would otherwise turn them into percentages too.
    original_years = df["Year"]

    percent_change = df.pct_change().mul(100).round(1)

    # Overwrite the transformed "Year" column, keeping its position in the frame.
    return percent_change.assign(Year=original_years)