Coverage for censusdis/multiyear.py: 74%
86 statements
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
« prev ^ index » next coverage.py v6.5.0, created at 2025-04-03 05:39 +0000
1"""Utility functions for downloading, graphing and analyzing multiple years of ACS data."""
3from collections import defaultdict
4import pandas as pd
6import matplotlib.pyplot as plt
7from matplotlib.ticker import FuncFormatter
9import censusdis.data as ced
10from censusdis.datasets import ACS1, ACS3, ACS5
12import re
14from typing import List, Optional, Union, Iterable, Callable, Dict
def is_variable_column(
    col: str,
    download_variables: Optional[Union[str, Iterable[str]]],
    group: Optional[str],
) -> bool:
    """
    Determine whether a column is a Census variable.

    There are two ways to download data from the Census Bureau: as a group (i.e. table) or
    as a list of variables. The way to test whether a column of the resulting
    df is a variable depends on whether the data was requested by group or variable.

    In practice this function is used to drop columns that contain metadata that is not needed.
    We want columns like "B01001_001E" and not like "STATE".

    Parameters
    ----------
    col
        The column name to test.
    download_variables
        The variable (a single string) or variables that were requested, or
        None if the data was requested by group.
    group
        The group (table) that was requested, or None if the data was
        requested by variable.

    Returns
    -------
    True if `col` starts with `group`, or if `col` is one of `download_variables`.
    False otherwise.
    """
    if group and col.startswith(group):
        return True

    if not download_variables:
        return False

    # A single variable may be passed as a plain string. `in` on a string is a
    # substring test (e.g. "B01001" in "B01001_001E"), so compare for equality.
    if isinstance(download_variables, str):
        return col == download_variables

    return col in download_variables
# This function is based on the notebook linked to in this github issue:
# https://github.com/censusdis/censusdis/issues/325
def name_mapper(
    dataset: str,
    vintage: int,
    download_variables: Optional[Union[str, Iterable[str]]],
    group: Optional[str],
) -> Callable[[str], str]:
    """
    Return a function that converts an ACS variable to its label.

    Parameters
    ----------
    dataset
        The dataset whose variable metadata is searched.
    vintage
        The vintage (year) whose labels are used.
    download_variables
        The variables that were requested, or None if a group was requested.
    group
        The group that was requested, or None if variables were requested.

    Returns
    -------
    A callable that maps a column name to the last component of its label.
    Names that are not variable columns are returned unchanged.
    """

    def inner(variable: str) -> str:
        """Map from the variables we got back to their labels."""
        if not is_variable_column(variable, download_variables, group):
            # Not in the group we are interested in, so leave it as is.
            return variable

        # Look up details of the particular variable.
        matches = ced.variables.search(
            dataset, vintage, group_name=group, name=variable
        )

        # Census uses !! to indicate nesting of Labels. Ex. 'Estimate!!Total:'
        # We care about the last part.
        label = re.split(r"!!", matches.iloc[0]["LABEL"])[-1]

        # Starting in 2020 Labels which are parents of other Labels have a : as a suffix.
        # See an example here: https://data.census.gov/table?q=country%20of%20birth&g=9700000US3612510
        # (Ex. "Total:", "Asia:", "Eastern Asia:", "China:")
        # Drop this trailing ":". `endswith` is also safe for an empty label,
        # where indexing the last character would raise IndexError.
        return label[:-1] if label.endswith(":") else label

    return inner
def get_unique_labels_for_variable(
    acs: str, variable: str, years: List[int]
) -> Dict[str, List[int]]:
    """
    Collect every label the ACS has attached to a variable across the given years.

    The ACS occasionally relabels a variable. Some changes are cosmetic, while
    others repurpose the variable entirely. For example, B08006_017E was
    'Estimate!!Total!!Motorcycle' in 2005, 'Estimate!!Total!!Worked at home' in 2006
    and 'Estimate!!Total:!!Worked from home' in 2019.

    To cut down on false positives, each label is lower-cased ("Race" is capitalized
    inconsistently over the years) and every ":" is stripped before grouping, so
    "estimate!!total:!!native" and "estimate!!total!!native" count as the same label.

    Parameters
    ----------
    acs
        The ACS to use. Ex. censusdis.datasets.ACS1
    variable
        The variable in question. Ex. 'B01001_001E'
    years
        The years to check. Ex. [2005, 2006, 2007]

    Returns
    -------
    A dict keyed by normalized label; each value lists the years in which that
    label was used. A result of length 1 means the label never changed.
    """
    years_by_label = defaultdict(list)

    for vintage in years:
        raw_label = ced.variables.get(acs, vintage, variable)["label"]
        normalized = raw_label.lower().replace(":", "")
        years_by_label[normalized].append(vintage)

    return years_by_label
class VariableMistmatchOverTimeError(Exception):
    """
    Signal that an ACS variable has carried more than one label over time.

    Note: the spelling "Mistmatch" is part of the public name and is kept so
    existing `except` clauses continue to work.
    """
def warn_variable_changes(
    df: pd.DataFrame,
    dataset: str,
    vintages: List[int],
    download_variables: Optional[Union[str, Iterable[str]]],
    group: Optional[str],
    prompt: bool,
) -> None:
    """
    Issue a warning when an ACS variable has had multiple labels over the years.

    In the ACS, sometimes the same variable is used for different things in different years.
    For example in 2005 `B08006_017E` was used for "Estimate!!Total!!Motorcycle". But in 2006 it
    changed to "Estimate!!Total!!Worked at home" and in 2019 it changed to "Estimate!!Total:!!Worked from home".
    This code alerts users of any variables which have had different labels over time.

    This function always prints a warning if it encounters that situation. If prompt is True
    then it also prompts the user to confirm whether they want to continue with the download.

    Parameters
    ----------
    df
        The downloaded data. Only its column names are inspected.
    dataset
        The dataset the data was downloaded from.
    vintages
        The years whose labels are compared.
    download_variables
        The variables that were requested, or None if a group was requested.
    group
        The group that was requested, or None if variables were requested.
    prompt
        If True, ask the user whether to continue after each warning.

    Raises
    ------
    VariableMistmatchOverTimeError
        If `prompt` is True and the user answers anything other than "y"
        (case and surrounding whitespace are ignored).
    """
    for col in df.columns:
        if not is_variable_column(col, download_variables, group):
            continue

        labels_to_years = get_unique_labels_for_variable(dataset, col, vintages)
        if len(labels_to_years) <= 1:
            continue

        print(f"Warning: {col} has had multiple labels over the selected years:")
        for label, label_years in labels_to_years.items():
            print(f"\t'{label}' in {label_years}")

        if prompt:
            answer = input("Continue downloading dataset (y/n)?")
            # Accept "Y" and stray whitespace so a harmless typo does not abort.
            if answer.strip().lower() != "y":
                raise VariableMistmatchOverTimeError(
                    f"{col} has had multiple labels over the selected years."
                )
def download_multiyear(
    dataset: str,
    vintages: List[int],
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    *,
    group: Optional[str] = None,
    rename_vars: bool = True,
    drop_cols: bool = True,
    prompt: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Download multiple years of ACS data into a single dataframe.

    Variables can be specified individually by `download_variables` or as a table by `group`.

    Parameters
    ----------
    dataset
        Must be one of `censusdis.datasets.ACS1`, `censusdis.datasets.ACS3` or `censusdis.datasets.ACS5`.
    vintages
        A non-empty list of years to download data for.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    group
        One or more groups (as defined by the U.S. Census for the data set)
        whose variable values should be downloaded.
    rename_vars
        If True, rename the columns from variables (ex. "B01001_001E") to their labels (ex. "Total").
        The labels for the last year are used.
    drop_cols
        If True, drop cols that do not contain variables or the year (ex. geography columns).
    prompt
        This function emits a warning each time a downloaded variable has had multiple labels over time.
        If True, prompt the user whether they want to continue downloading the dataset despite the differences.
    **kwargs
        Geography parameters passed directly to `ced.download`.

    Returns
    -------
    A dataframe.

    Raises
    ------
    ValueError
        If `vintages` is empty, if `dataset` is not one of the supported ACS
        datasets, or if not exactly one of `download_variables` and `group` is set.
    VariableMistmatchOverTimeError
        If `prompt` is True and the user declines to continue after a warning.

    Examples
    --------
    # Download the population by nativity in Great Neck School District, NY.
    from censusdis.multiyear import download_multiyear
    from censusdis.datasets import ACS5
    from censusdis.states import NY

    # Download the entire table.
    df = download_multiyear(
        dataset=ACS5,
        vintages=[2009, 2014, 2019],
        group="B05012",
        prompt=False,
        state=NY,
        school_district_unified="12510",
    )

    # Downloaded selected variables.
    df = download_multiyear(
        dataset=ACS5,
        vintages=[2009, 2014, 2019],
        download_variables=["B05006_049E", "B05006_060E", "B05006_054E"],
        state=NY,
        school_district_unified="12510",
        drop_cols=True,
    )
    """
    # Fail fast: with no vintages there is nothing to concatenate and no
    # "last year" to take labels from. `len` is used rather than truthiness so
    # array-like inputs are not rejected.
    if len(vintages) == 0:
        raise ValueError("At least one vintage must be provided.")

    if dataset not in [ACS1, ACS3, ACS5]:
        raise ValueError(
            "Dataset must be one of `censusdis.datasets.ACS1`, `censusdis.datasets.ACS3` or `censusdis.datasets.ACS5`"
        )

    if (download_variables is None) == (group is None):
        raise ValueError("Exactly one of download_variables and group must be set.")

    frames = []

    for vintage in vintages:
        # This loop can take a while, so provide feedback to the user
        print(".", end="", flush=True)

        df_new = ced.download(
            dataset=dataset,
            vintage=vintage,
            download_variables=download_variables,
            group=group,
            **kwargs,
        )

        df_new["Year"] = vintage
        frames.append(df_new)

    # Concatenate once; concatenating inside the loop recopies all prior rows
    # on every iteration.
    df = pd.concat(frames)

    warn_variable_changes(df, dataset, vintages, download_variables, group, prompt)

    if drop_cols:
        keep = [
            col
            for col in df.columns
            if is_variable_column(col, download_variables, group) or col == "Year"
        ]
        df = df[keep]

    if rename_vars:
        # Labels are taken from the last requested vintage.
        df = df.rename(
            columns=name_mapper(
                dataset=dataset,
                vintage=vintages[-1],
                download_variables=download_variables,
                group=group,
            )
        )

    df = df.reset_index(drop=True)
    return df
def graph_multiyear(
    df: pd.DataFrame,
    title: str = "",
    yaxis_title: str = "",
    y_cols: Optional[Iterable[str]] = None,
) -> None:
    """
    Create a (multi-line) graph of time series data.

    Parameters
    ----------
    df
        Must have a column called 'Year' which will serve as the x-axis.
    title
        Title for the graph.
    yaxis_title
        Title for the y-axis.
    y_cols
        A list of columns in `df` to create lines for. If None then will graph all
        columns except "Year". If there are more columns than palette colors,
        the colors repeat.

    Returns
    -------
    NoneType

    Examples
    --------
    # Graph the population by nativity in Great Neck School District, NY.
    from censusdis.multiyear import download_multiyear, graph_multiyear
    from censusdis.datasets import ACS5
    from censusdis.states import NY

    df = download_multiyear(
        dataset=ACS5,
        vintages=[2010, 2015, 2020],
        group="B05012",
        state=NY,
        school_district_unified="12510",
    )
    graph_multiyear(
        df,
        "Population by Nativity in Great Neck School District",
        "Population",
        ["Total", "Native", "Foreign-Born"],
    )
    """

    # Define a function to format the y-axis with commas
    def format_yaxis(value, tick_position):
        return f"{value:,.0f}"

    if not y_cols:
        y_cols = [col for col in df.columns if col != "Year"]

    colorblind_palette = [
        "#E69F00",
        "#56B4E9",
        "#009E73",
        "#F0E442",
        "#0072B2",
        "#D55E00",
        "#CC79A7",
        "#999999",
        "#E41A1C",
    ]

    for idx, y_col in enumerate(y_cols):
        plt.plot(
            df["Year"],
            df[y_col],
            label=y_col,
            marker="o",
            # Wrap around so more series than colors does not raise IndexError.
            color=colorblind_palette[idx % len(colorblind_palette)],
        )

    # Explicitly set the x-axis ticks to match the actual years in the data
    plt.xticks(ticks=df["Year"], labels=df["Year"], rotation=-45)

    # Apply comma formatting to the y-axis
    plt.gca().yaxis.set_major_formatter(FuncFormatter(format_yaxis))

    plt.xlabel("Year")
    plt.ylabel(yaxis_title)
    plt.title(title)
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show()
def pct_change_multiyear(df: pd.DataFrame) -> pd.DataFrame:
    r"""
    Turn a multi-year dataframe of raw counts into row-over-row percent changes.

    Applies pd.DataFrame.pct_change to the dataframe, scales the result to percent,
    and rounds to 1 decimal point. The "Year" column is carried over unchanged.
    The first row has no predecessor, so its values are NaN.

    Parameters
    ----------
    df
        Must have a column called 'Year'.

    Returns
    -------
    A new Dataframe; the input is not modified.

    Examples
    --------
    # Graph the percent change in the population by nativity in Great Neck School District, NY.
    from censusdis.multiyear import (
        download_multiyear,
        pct_change_multiyear,
        graph_multiyear,
    )
    from censusdis.datasets import ACS5
    from censusdis.states import NY

    df = download_multiyear(
        dataset=ACS5,
        vintages=[2010, 2015, 2020],
        group="B05012",
        prompt=False,
        state=NY,
        school_district_unified="12510",
    )

    df = pct_change_multiyear(df)
    print(df)

    graph_multiyear(
        df,
        "Percent Change in Population by Nativity\nGreat Neck School District, NY",
        "Percent Change",
    )
    """
    # Save the original years; pct_change would otherwise turn them into changes too.
    year_column = df["Year"]

    pct = df.pct_change().mul(100).round(1)
    pct["Year"] = year_column

    return pct