# Copyright (c) 2022, 2023 Darren Erik Vengroff
"""
Utilities for loading census data.

This module relies on the US Census API, which
it wraps in a pythonic manner.
"""

import warnings
from logging import getLogger
from typing import (
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
)

import io
import requests
import gzip

import geopandas as gpd
import numpy as np
import pandas as pd

import censusdis.geography as cgeo
import censusdis.maps as cmap
from censusdis.impl.exceptions import CensusApiException
from censusdis.impl.fetch import data_from_url
from censusdis.impl.us_census_shapefiles import (
    add_geography,
    clip_water,
    infer_geo_level,
    geo_query_from_data_query_inner_geo,
)
from censusdis.impl.varcache import VariableCache
from censusdis.impl.varsource.base import VintageType
from censusdis.impl.varsource.censusapi import CensusApiVariableSource
from censusdis.values import ALL_SPECIAL_VALUES
from censusdis.datasets import ACS5, DECENNIAL_PUBLIC_LAW_94_171
from censusdis.states import ABBREVIATIONS_FROM_IDS

import censusdis.impl.fetch


logger = getLogger(__name__)


GeoFilterType = Optional[Union[str, Iterable[str]]]
"""
The type we accept for geographic filters.

They are used for the values of `kwargs` to
:py:func:`download`.

These filters are either single values as a string,
or, if multivalued, then an iterable containing all
the values allowed by the filter. For example::

    import censusdis.data as ced

    from censusdis.states import NJ, NY, CT

    # Two different kinds of kwarg for `state=`, both of
    # which are of `GeoFilterType`:
    df_one_state = ced.download("acs/acs5", 2020, ["NAME"], state=NJ)
    df_tri_state = ced.download("acs/acs5", 2020, ["NAME"], state=[NJ, NY, CT])
"""


def _gf2s(geo_filter: GeoFilterType) -> Optional[str]:
    """
    Convert a filter to a string.

    For the Census API, multiple values are encoded
    in a single comma-separated string.
    """
    if geo_filter is None or isinstance(geo_filter, str):
        return geo_filter
    return ",".join(geo_filter)
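

# Illustrative sketch (not part of the original module) of how `_gf2s`
# flattens geographic filters into the form the Census API expects:
#
#   _gf2s(None)          -> None
#   _gf2s("34")          -> "34"
#   _gf2s(["34", "36"])  -> "34,36"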


_MAX_VARIABLES_PER_DOWNLOAD = 50
"""
The maximum number of variables we can ask for in one census API query.

The U.S. Census sets this limit, not us. In order to not expose our
users to the limit, :py:func:`~download` mostly obscures the fact that
requests to download more than this many variables are broken into
multiple calls to the census API and then the results are stitched back
together by either merging or concatenation. This is all handled in
:py:func:`~_download_multiple`.
"""


__dw_strategy_metrics = {"merge": 0, "concat": 0}
"""
Counters for how often we use each strategy for wide tables.
"""


def _download_wide_strategy_metrics() -> Dict[str, int]:
    """
    Metrics on which strategies have been used for wide tables.

    Returns
    -------
    A dictionary of metrics on how often each strategy has
    been used.
    """
    return dict(**__dw_strategy_metrics)


def _download_multiple(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    *,
    query_filter: Dict[str, str],
    api_key: Optional[str],
    census_variables: "VariableCache",
    with_geometry: bool,
    with_geometry_columns: bool,
    tiger_shapefiles_only: bool,
    row_keys: Optional[Union[str, Iterable[str]]],
    **kwargs: cgeo.InSpecType,
) -> pd.DataFrame:
    """
    Download data in groups of columns and concatenate the results together.

    The reason for this function is that the API will only return a maximum
    of 50 columns per query. This function downloads wider data 50 columns
    at a time and concatenates them.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    census_variables
        A cache of metadata about variables.
    row_keys
        An optional set of identifier keys to help merge together requests for more than the census API limit of
        50 variables per query. These keys are useful for census datasets such as the Current Population Survey
        where the geographic identifiers do not uniquely identify each row.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The full results of the query with all columns.
    """
    # Divide the variables into groups. If row keys are provided, include them
    # in each chunk of variables, while respecting the variable max.
    if row_keys:
        chunk_size = _MAX_VARIABLES_PER_DOWNLOAD - len(row_keys)
        variable_groups = [
            # black and flake8 disagree about the whitespace before ':' here...
            # We need to drop duplicates in each chunk of variables
            # since the row_key variables might already be present in one of the chunks.
            [
                item
                for item in row_keys
                + download_variables[start : start + chunk_size]  # noqa: E203
                if item not in row_keys
                or row_keys.index(item)
                == (
                    row_keys
                    + download_variables[start : start + chunk_size]  # noqa: E203
                ).index(item)
            ]
            for start in range(0, len(download_variables), chunk_size)
        ]
    else:
        variable_groups = [
            # black and flake8 disagree about the whitespace before ':' here...
            download_variables[start : start + _MAX_VARIABLES_PER_DOWNLOAD]  # noqa: E203
            for start in range(0, len(download_variables), _MAX_VARIABLES_PER_DOWNLOAD)
        ]
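
    # Illustrative sketch (not part of the original module): with the 50
    # variable limit and row_keys=["KEY"], a request for 120 variables is
    # broken into chunks of at most 49 download variables, each prefixed
    # with the row key:
    #
    #   ["KEY", v1, ..., v49], ["KEY", v50, ..., v98], ["KEY", v99, ..., v120]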

    if len(variable_groups) < 2:
        raise ValueError(
            "_download_multiple expects to be called with at least "
            f"{_MAX_VARIABLES_PER_DOWNLOAD + 1} variables. With fewer, "
            "use download instead."
        )

    # Get the data for each chunk. Note that we leave out
    # extra geometry columns at this point. We will get them
    # later if we need them, but they get in the way at this
    # point.
    dfs = [
        download(
            dataset,
            vintage,
            variable_group,
            query_filter=query_filter,
            api_key=api_key,
            variable_cache=census_variables,
            with_geometry=with_geometry and (ii == 0),
            with_geometry_columns=False,
            **kwargs,
        )
        for ii, variable_group in enumerate(variable_groups)
    ]

    # What variables came back in the first df but were not
    # requested? These are a key to the geography the row
    # represents. For example, 'STATE' and 'COUNTY' might
    # be these variables if we did a county-level query to
    # the census API.
    geo_key_variables = [f for f in dfs[0].columns if f not in set(variable_groups[0])]

    # Now that we know the geometry keys, we may have to get back the other
    # geometry columns we left out the first time.
    if with_geometry and with_geometry_columns:
        dfs[0] = download(
            dataset,
            vintage,
            variable_groups[0],
            query_filter=query_filter,
            api_key=api_key,
            variable_cache=census_variables,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            **kwargs,
        )

    # If we put in the geometry column, it's not part of the
    # key.
    if with_geometry:
        geo_key_variables = [f for f in geo_key_variables if f != "geometry"]

    # Now we have to decide if we are going to use the merge
    # strategy or the concat strategy to combine the data frames
    # we downloaded. Why do we have two strategies? Because we are
    # dealing with two kinds of data. One kind, from data sets like
    # ACS (https://www.census.gov/programs-surveys/acs.html),
    # has a unique key of columns that specify geography. The other
    # kind, from data sets like CPS
    # (https://www.census.gov/programs-surveys/cps.html), doesn't.
    #
    # In the unique key case, we can join the data frames that come
    # back on those key columns and get the final wide data frame
    # we want for the user.
    #
    # In the non-unique key case, we can't do this. These data sets
    # may have multiple rows for a value of the key columns. We can't
    # join here. Instead, we can only concatenate the tables
    # horizontally and hope that the rows came back in the same order
    # for each of them.

    # We hope to be able to merge. It is safer. If row_keys is
    # supplied, they are included in the merge keys.
    merge_strategy = True

    if row_keys:
        merge_keys = geo_key_variables + row_keys
    else:
        merge_keys = geo_key_variables

    # But if there are any non-unique keys in any df, we can't
    # merge.
    for df_slice in dfs:
        if len(df_slice.value_counts(merge_keys, sort=False)) != len(df_slice.index):
            merge_strategy = False
            break

    if with_geometry and with_geometry_columns and not merge_strategy:
        raise ValueError(
            "`with_geometry_columns=True` is only supported for very wide results "
            "when the merge strategy can be used. This merge strategy is used when every "
            "row of the result is for a unique geography, as is the case in data sets like "
            f'ACS5 ("{ACS5}") and DECENNIAL_PUBLIC_LAW_94_171 ("{DECENNIAL_PUBLIC_LAW_94_171}"). '
            "If this functionality is really important to you (note that it would create a lot "
            "of duplicate geometry values), we suggest you set `with_geometry=False` in this call "
            "and then merge with a `GeoDataFrame` with the geometries you want after the fact."
        )

    if merge_strategy:
        # We can do the merge strategy.

        __dw_strategy_metrics["merge"] = __dw_strategy_metrics["merge"] + 1

        df_data = dfs[0]

        for df_right in dfs[1:]:
            df_data_columns = set(df_data.columns)
            df_data = df_data.merge(
                df_right[
                    [
                        col
                        for col in df_right.columns
                        if (col in merge_keys) or (col not in df_data_columns)
                    ]
                ],
                on=merge_keys,
            )
    else:
        # We are going to have to fall back on the concat
        # strategy. Before we do the concat, however, let's
        # double-check that the key columns are the same at
        # the corresponding row in every df. Otherwise, something
        # is fishy, and it is not safe to concat without mixing
        # data that should be in different rows.

        rows0 = len(dfs[0].index)

        for df_slice in dfs[1:]:
            if not (
                rows0 == len(df_slice.index)
                and dfs[0][geo_key_variables].equals(df_slice[geo_key_variables])
            ):
                # At least one difference. So we cannot use the
                # concat strategy either.
                if not row_keys:
                    raise CensusApiException(
                        "Neither the merge nor the concat strategy is viable. "
                        "We made multiple queries to the census API because more than "
                        f"{_MAX_VARIABLES_PER_DOWNLOAD} variables were requested. "
                        "If you don't need all the variables, it is always safer to "
                        f"download fewer than {_MAX_VARIABLES_PER_DOWNLOAD} variables. "
                        f"If you need more than {_MAX_VARIABLES_PER_DOWNLOAD}, you can supply the `row_keys` "
                        "argument with a set of variables that uniquely identify each row."
                    )
                else:
                    raise CensusApiException(
                        f"Neither the merge nor the concat strategy is viable using row_keys: {row_keys}. "
                        "The supplied keys should uniquely identify every row in the dataset to work. "
                        "If you don't need all the variables, it is always safer to "
                        f"download fewer than {_MAX_VARIABLES_PER_DOWNLOAD} variables."
                    )

        # Concat strategy is as safe as it will ever be. We hope the server
        # side did not reorder the results across queries.
        logger.info(
            "Using the concat strategy, which is not guaranteed reliable if "
            "the census API returned data for multiple sub-queries of less than "
            "or equal to %d in different row orders. "
            "It is always safest to query no more than %d "
            "variables at a time. Please do so unless you really need them all.",
            _MAX_VARIABLES_PER_DOWNLOAD,
            _MAX_VARIABLES_PER_DOWNLOAD,
        )

        __dw_strategy_metrics["concat"] = __dw_strategy_metrics["concat"] + 1

        df_data = pd.concat(
            [dfs[0]] + [df.drop(geo_key_variables, axis="columns") for df in dfs[1:]],
            axis="columns",
        )

    return df_data
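

# Illustrative sketch (not part of the original module): `_download_multiple`
# is never called directly; `download` invokes it when more than
# `_MAX_VARIABLES_PER_DOWNLOAD` variables are requested. A call such as
#
#   ced.download(ACS5, 2020, sixty_variables, state="34", county="*")
#
# (where `sixty_variables` is a hypothetical list of 60 variable names) is
# split into two chunks behind the scenes and merged back together on the
# geography key columns (STATE, COUNTY, ...).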


def download_lodes(
    dataset: str,
    vintage: VintageType,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    version: Optional[str] = None,
    home_geography: Optional[Union[bool, Dict[str, str]]] = None,
    with_geometry: bool = False,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
    remove_water: bool = False,
    **kwargs: cgeo.InSpecType,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download LODES data from the US Census.

    This is typically not called directly, but instead LODES data
    is obtained by calling :py:func:`~download`, which then calls
    this as needed for LODES data sets.

    Parameters
    ----------
    dataset
        The LODES dataset to download from, for example
        `"lodes/od/main/jt00"` or `"lodes/wac/s000/jt00"`. There are
        symbolic names for datasets in :py:mod:`censusdis.datasets`.
    vintage
        The vintage to download data for. For LODES data sets this is
        an integer year, for example, `2020`.
    download_variables
        The variables to download. If `None`, all numeric variables
        in the data set are downloaded.
    version
        The LODES version to download. If `None`, defaults to `"LODES8"`.
    home_geography
        For origin-destination ("od") data sets, an optional specification
        of the home geography to group by. If `True`, the geography
        specified in `kwargs` is also used for the home side. If a
        dictionary, it is used directly as the home geography specification.
        If `None`, home geographies are grouped at the block level.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if `with_geometry=True`, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.

    Returns
    -------
    The LODES data, as either a `pd.DataFrame` or a `gpd.GeoDataFrame`
    if `with_geometry=True`.
    """
    if version is None:
        version = "LODES8"

    if isinstance(home_geography, bool):
        if home_geography:
            home_geography = dict(**kwargs)
        else:
            home_geography = None

    bound_path = cgeo.PathSpec.partial_prefix_match(dataset, vintage, **kwargs)
    geo_bindings = bound_path.bindings

    state = geo_bindings["state"]

    if state == "*":
        # TODO - we could just concatenate them all.
        raise ValueError("Wildcards not supported for state LODES data.")

    if state not in ABBREVIATIONS_FROM_IDS:
        raise ValueError(f"Unknown state id {state}")

    state_name = ABBREVIATIONS_FROM_IDS[state].lower()

    _, data_set_type, part_or_segment, job_type = dataset.split("/")

    if data_set_type in ["rac", "wac"]:
        part_or_segment = part_or_segment.upper()

    url = (
        f"https://lehd.ces.census.gov/data/lodes/{version}/{state_name}/{data_set_type}/{state_name}_{data_set_type}_"
        f"{part_or_segment}_{job_type.upper()}_{vintage}.csv.gz"
    )
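
    # Illustrative sketch (not part of the original module): for
    # dataset="lodes/wac/s000/jt00", vintage=2020, and state "34" (New Jersey),
    # the URL constructed above is
    # https://lehd.ces.census.gov/data/lodes/LODES8/nj/wac/nj_wac_S000_JT00_2020.csv.gz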

    logger.info(f"Downloading LODES data from {url}")

    results = requests.get(url)
    if results.status_code != requests.status_codes.codes.OK:
        raise CensusApiException(
            f"Unable to get LODES data. Attempted to fetch from {url}. "
            f"Status: {results.status_code}; {results.reason}"
        )

    # Reuse the response we already fetched rather than requesting the
    # same URL a second time.
    gz_content = results.content
    content = gzip.decompress(gz_content)
    df_lodes = pd.read_csv(
        io.StringIO(content.decode("utf-8")), dtype={"w_geocode": str, "h_geocode": str}
    )

    # We don't need the date.
    df_lodes = df_lodes.drop("createdate", axis="columns")

    # Map the geographies to the conventions censusdis uses.

    def map_geo_cols(*, from_prefix: str, to_suffix: str = ""):
        df_lodes[f"STATE{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[:2]
        df_lodes[f"COUNTY{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[2:5]
        df_lodes[f"TRACT{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[5:11]
        df_lodes[f"BLOCK{to_suffix}"] = df_lodes[f"{from_prefix}geocode"].str[11:15]
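
    # Illustrative sketch (not part of the original module): a 15-digit block
    # geocode such as "340170030001234" splits into STATE "34", COUNTY "017",
    # TRACT "003000", and BLOCK "1234".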

    if data_set_type in ["od", "wac"]:
        map_geo_cols(from_prefix="w_")
    else:
        map_geo_cols(from_prefix="h_")

    if data_set_type == "od":
        map_geo_cols(from_prefix="h_", to_suffix="_H")

    for geocode_col in ["w_geocode", "h_geocode"]:
        if geocode_col in df_lodes.columns:
            # `drop` is not in-place, so keep the result.
            df_lodes = df_lodes.drop(geocode_col, axis="columns")

    group_keys = []
    selectors = {}

    for geo, binding in geo_bindings.items():
        group_keys.append(f"{geo.upper()}")
        if binding != "*":
            selectors[f"{geo.upper()}"] = binding

    if data_set_type == "od":
        if home_geography is None:
            for col in df_lodes.columns:
                if col.endswith("_H"):
                    group_keys.append(col)
        else:
            # There is more grouping to do.
            home_bound_path = cgeo.PathSpec.partial_prefix_match(
                dataset, vintage, **home_geography
            )
            home_geo_bindings = home_bound_path.bindings

            for geo, binding in home_geo_bindings.items():
                group_keys.append(f"{geo.upper()}_H")
                if binding != "*":
                    selectors[f"{geo.upper()}_H"] = binding

    # Filter down based on fixed bindings.
    if selectors:
        criteria = None
        for col, binding in selectors.items():
            if criteria is None:
                criteria = df_lodes[col] == binding
            else:
                criteria = criteria & (df_lodes[col] == binding)
        df_lodes = df_lodes[criteria]

    if download_variables is None:
        download_variables = [
            col for col in df_lodes.columns if df_lodes[col].dtype != object
        ]

    # Group based on group keys.
    df_lodes = df_lodes.groupby(group_keys)[download_variables].sum().reset_index()

    if with_geometry:
        # We need to get the geometry and merge it in.
        geo_level = bound_path.path_spec.path[-1]
        shapefile_scope = bound_path.bindings[bound_path.path_spec.path[0]]

        gdf_data = add_geography(
            df_lodes,
            vintage,
            shapefile_scope,
            geo_level,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
        )

        if remove_water:
            gdf_data = clip_water(gdf_data, vintage)

        return gdf_data

    return df_lodes
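

# Illustrative sketch (not part of the original module): LODES data is normally
# requested through `download`, which dispatches to `download_lodes` for any
# dataset name beginning with "lodes/". For a workplace area characteristics
# data set of the form shown above, a call might look like
#
#   df_wac = ced.download("lodes/wac/s000/jt00", 2020, state="34", county="017")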


def download(
    dataset: str,
    vintage: VintageType,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    *,
    group: Optional[Union[str, Iterable[str]]] = None,
    leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
    set_to_nan: Union[bool, Iterable[int]] = True,
    skip_annotations: bool = True,
    query_filter: Optional[Dict[str, str]] = None,
    with_geometry: bool = False,
    with_geometry_columns: bool = False,
    tiger_shapefiles_only: bool = False,
    remove_water: bool = False,
    download_contained_within: Optional[Dict[str, cgeo.InSpecType]] = None,
    area_threshold: float = 0.8,
    api_key: Optional[str] = None,
    variable_cache: Optional["VariableCache"] = None,
    row_keys: Optional[Union[str, Iterable[str]]] = None,
    **kwargs: cgeo.InSpecType,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Download data from the US Census API.

    This is the main API for downloading US Census data with the
    `censusdis` package. There are many examples of how to use
    this in the demo notebooks provided with the package at
    https://github.com/vengroff/censusdis/tree/main/notebooks.

    *A note on variables and groups*: there are multiple ways to specify the
    variables you want to download: individually in `download_variables`,
    by one or more groups in `group`, and by the leaves of one or more groups
    in `leaves_of_group`. Note that these three sources of variables are
    deduplicated, so you will only get one column for a variable no matter
    how many times it is specified.

    *Specifying census geographies*: censusdis provides access to many
    census datasets, each of which can be retrieved at a particular set of
    geographic grains. To accommodate this, `download()` takes a set
    of kwargs to define the geographic level of the returned data. You can check
    which geographies are available for a particular dataset with
    :py:func:`~geographies`.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"`
        in :py:mod:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    group
        One or more groups (as defined by the U.S. Census for the data set)
        whose variable values should be downloaded. These are in addition to
        any specified in `download_variables`.
    leaves_of_group
        One or more groups (as defined by the U.S. Census for the data set)
        whose leaf variable values should be downloaded. These are in addition to
        any specified in `download_variables` or `group`. See
        :py:meth:`VariableCache.group_leaves` for more details on the semantics of
        leaves vs. non-leaf group variables.
    set_to_nan
        A list of values that should be set to NaN. Normally these are special
        values that the U.S. Census API sometimes returns. If `True`, then all
        values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
        If `False`, no replacements will be made.
    skip_annotations
        If `True` try to filter out `group` or `leaves_of_group` variables that are
        annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
        for more details. Variable names passed in `download_variables` are not
        affected by this flag.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if `with_geometry=True`, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    download_contained_within
        A dictionary specifying the geography or geographies that our results
        should be filtered down to be contained within.
    area_threshold
        What fraction of the area of other geographies must be contained
        in our geography to be included. Ignored if `download_contained_within` is
        `None`.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited to 500 per day.
    variable_cache
        A cache of metadata about variables.
    row_keys
        An optional set of identifier keys to help merge together requests for more than the census API limit of
        50 variables per query. These keys are useful for census datasets such as the Current Population Survey
        where the geographic identifiers do not uniquely identify each row.
    kwargs
        A specification of the geometry that we want data for. For example,
        `state = "*", county = "*"` will download county-level data for
        the entire US.

    Returns
    -------
    A :py:class:`~pd.DataFrame` or `~gpd.GeoDataFrame` containing the requested US Census data.
    """
    if dataset.startswith("lodes/"):
        # Special case for the LODES data sets, which go down a completely
        # different path.
        if download_contained_within is not None:
            raise ValueError(
                "`download_contained_within` not supported for LODES data sets."
            )

        return download_lodes(
            dataset,
            vintage,
            download_variables,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            **kwargs,
        )

    if download_contained_within is not None:
        # Put the contained_within context around it.
        return contained_within(
            area_threshold=area_threshold, **download_contained_within
        ).download(
            dataset,
            vintage,
            download_variables,
            group=group,
            leaves_of_group=leaves_of_group,
            set_to_nan=set_to_nan,
            skip_annotations=skip_annotations,
            query_filter=query_filter,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            remove_water=remove_water,
            api_key=api_key,
            variable_cache=variable_cache,
            row_keys=row_keys,
            **kwargs,
        )

    if variable_cache is None:
        variable_cache = variables

    # Ensure list operations work.
    if row_keys:
        row_keys = list(row_keys)

    # The side effect here is to prime the cache.
    cgeo.geo_path_snake_specs(dataset, vintage)

    if set_to_nan is True:
        set_to_nan = ALL_SPECIAL_VALUES

    # In case they came to us in py format, as kwargs often do.
    kwargs = {
        cgeo.path_component_from_snake(dataset, vintage, k): v
        for k, v in kwargs.items()
    }

    # Parse out the download variables.
    download_variables = _parse_download_variables(
        dataset,
        vintage,
        download_variables=download_variables,
        group=group,
        leaves_of_group=leaves_of_group,
        skip_annotations=skip_annotations,
        variable_cache=variable_cache,
    )

    if len(download_variables) <= _MAX_VARIABLES_PER_DOWNLOAD and row_keys:
        warnings.warn(
            "\n The row_keys argument is intended to be used only when the number of requested"
            "\n variables exceeds the Census-defined limit of 50."
            "\n The supplied value(s) will be ignored.",
            UserWarning,
        )

    # Special case if we are trying to get too many fields.
    if len(download_variables) > _MAX_VARIABLES_PER_DOWNLOAD:
        return _download_multiple(
            dataset,
            vintage,
            download_variables,
            api_key=api_key,
            census_variables=variable_cache,
            query_filter=query_filter,
            with_geometry=with_geometry,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
            row_keys=row_keys,
            **kwargs,
        )

    # Prefetch all the types before we load the data.
    # That way we fail fast if a field is not known.
    _prefetch_variable_types(dataset, vintage, download_variables, variable_cache)

    # Also check that the row_keys, if supplied, are present in the dataset.
    if row_keys:
        _prefetch_variable_types(dataset, vintage, row_keys, variable_cache)

    # If we were given a list, join it together into
    # a comma-separated string.
    string_kwargs = {k: _gf2s(v) for k, v in kwargs.items()}

    return _download_remote(
        dataset,
        vintage,
        download_variables=download_variables,
        set_to_nan=set_to_nan,
        query_filter=query_filter,
        with_geometry=with_geometry,
        with_geometry_columns=with_geometry_columns,
        tiger_shapefiles_only=tiger_shapefiles_only,
        remove_water=remove_water,
        api_key=api_key,
        variable_cache=variable_cache,
        **string_kwargs,
    )
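

# Illustrative sketch (not part of the original module): a typical call to
# `download`, fetching the name and total population (the `B01001_001E`
# variable) of every county in New Jersey from the 2020 ACS 5-year data set:
#
#   import censusdis.data as ced
#   from censusdis.datasets import ACS5
#
#   df_counties = ced.download(
#       ACS5, 2020, ["NAME", "B01001_001E"], state="34", county="*"
#   )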


def _download_remote(
    dataset: str,
    vintage: VintageType,
    *,
    download_variables: List[str],
    set_to_nan: Union[bool, Iterable[float]] = True,
    query_filter: Optional[Dict[str, str]] = None,
    with_geometry: bool,
    with_geometry_columns: bool,
    tiger_shapefiles_only: bool,
    remove_water: bool,
    api_key: Optional[str],
    variable_cache: "VariableCache",
    **kwargs,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Make the actual remote call to download the data.

    This is the final step after we have parsed out and
    validated the variables and geometry.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    set_to_nan
        A list of values that should be set to NaN. Normally these are special
        values that the U.S. Census API sometimes returns. If `True`, then all
        values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
        If `False`, no replacements will be made.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    with_geometry
        If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
        will have a geometry that is a cartographic boundary suitable for plotting
        a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
        for details of the shapefiles that will be downloaded on your behalf to
        generate these boundaries.
    with_geometry_columns
        If `True` keep all the additional columns that come with shapefiles
        downloaded to get geometry information.
    tiger_shapefiles_only
        If `True` only look for TIGER shapefiles. If `False`, first look
        for CB shapefiles
        (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
        which are more suitable for plotting maps, then fall back on the full
        TIGER files
        (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
        only if CB is not available. This is mainly set to `True` only
        when `with_geometry_columns` is also set to `True`. The reason
        is that the additional columns in the shapefiles are different
        in the CB files than in the TIGER files.
    remove_water
        If `True` and if `with_geometry=True`, will query TIGER for AREAWATER shapefiles and
        remove water areas from returned geometry.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    variable_cache
        A cache of metadata about variables.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The downloaded variables, with or without added geometry, as
    either a `pd.DataFrame` or `gpd.GeoDataFrame`.
    """
    url, params, bound_path = census_table_url(
        dataset,
        vintage,
        download_variables,
        query_filter=query_filter,
        api_key=api_key,
        **kwargs,
    )
    df_data = data_from_url(url, params)

    # Coerce the types based on metadata about the variables.
    _coerce_downloaded_variable_types(
        dataset, vintage, download_variables, df_data, variable_cache
    )

    download_variables_upper = [dv.upper() for dv in download_variables]

    # Put the geo fields (STATE, COUNTY, etc...) that came back up front.
    df_data = df_data[
        [col for col in df_data.columns if col not in download_variables_upper]
        + download_variables_upper
    ]

    # NaN out as requested.
    if set_to_nan is True:
        set_to_nan = ALL_SPECIAL_VALUES
    if set_to_nan:
        df_data = df_data.replace(list(set_to_nan), np.nan)

    if with_geometry:
        # We need to get the geometry and merge it in.
        geo_level = bound_path.path_spec.path[-1]
        shapefile_scope = bound_path.bindings[bound_path.path_spec.path[0]]

        gdf_data = add_geography(
            df_data,
            vintage,
            shapefile_scope,
            geo_level,
            with_geometry_columns=with_geometry_columns,
            tiger_shapefiles_only=tiger_shapefiles_only,
        )

        if remove_water:
            gdf_data = clip_water(gdf_data, vintage)

        return gdf_data

    return df_data


def _coerce_downloaded_variable_types(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    df_data: pd.DataFrame,
    variable_cache: "VariableCache",
) -> None:
    """
    Coerce the type of each returned variable (column) in a data frame.

    We look up the type in the metadata in `variable_cache`.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    df_data
        The data that came back in JSON form from the census API.
    variable_cache
        A cache of metadata about variables.
    """
    for variable in download_variables:
        # predicateType does not exist in some older data sets like acs/acs3,
        # so in that case we just go with what we got in the JSON. But if we
        # have it, try to set the type.
        if "predicateType" in variable_cache.get(dataset, vintage, variable):
            field_type = variable_cache.get(dataset, vintage, variable)["predicateType"]

            if field_type == "int" or field_type == "long":
                if df_data[variable].isnull().any():
                    # Some Census data sets put in null in int fields.
                    # We have to go with a float to make this a NaN.
                    # Int has no representation for NaN or None.
                    df_data[variable] = df_data[variable].astype(float, errors="ignore")
                else:
                    try:
                        df_data[variable] = df_data[variable].astype(int)
                    except ValueError:
                        # Sometimes census metadata says int, but they
                        # put in float values anyway, so fall back on
                        # trying to get them as floats.
                        df_data[variable] = df_data[variable].astype(
                            float, errors="ignore"
                        )
                    except OverflowError:
                        # Some long IDs are actually better handled as strings.
                        df_data[variable] = df_data[variable].astype(str)
            elif field_type == "float":
                df_data[variable] = df_data[variable].astype(float)
            elif field_type == "string":
                pass
            else:
                # Leave it as an object.
                pass
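

# Illustrative sketch (not part of the original module) of the int-with-nulls
# case handled above: a pandas column containing NaN cannot be cast to int.
#
#   pd.Series([1.0, None]).astype(int)    # raises ValueError
#   pd.Series([1.0, None]).astype(float)  # fine: [1.0, NaN]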


def _prefetch_variable_types(
    dataset: str,
    vintage: VintageType,
    download_variables: List[str],
    variable_cache: "VariableCache",
) -> None:
    """
    Prefetch the types of all the variables we are going to try to download.

    This enables us to fail fast and have a better error message about the
    root cause of the issue than if we just blindly put in the variable names
    in the census API request and wait for it to fail.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    variable_cache
        A cache of metadata about variables.
    """
    for variable in download_variables:
        try:
            variable_cache.get(dataset, vintage, variable)
        except Exception as exc:
            census_url = CensusApiVariableSource.url(
                dataset, vintage, variable, response_format="html"
            )
            census_variables_url = CensusApiVariableSource.variables_url(
                dataset, vintage, response_format="html"
            )

            raise CensusApiException(
                f"Unable to get metadata on the variable {variable} from the "
                f"dataset {dataset} for year {vintage} from the census API. "
                f"Check the census URL for the variable ({census_url}) to ensure it exists. "
                f"If not found, check {census_variables_url} for all variables in the dataset."
            ) from exc


def _parse_download_variables(
    dataset: str,
    vintage: VintageType,
    *,
    download_variables: Optional[Union[str, Iterable[str]]] = None,
    group: Optional[Union[str, Iterable[str]]] = None,
    leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
    skip_annotations: bool = True,
    variable_cache: Optional["VariableCache"] = None,
) -> List[str]:
    """
    Parse out the full set of download variables.

    These may be encoded in `download_variables`, `group`, and/or `leaves_of_group`.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    group
        One or more groups (as defined by the U.S. Census for the data set)
        whose variable values should be downloaded. These are in addition to
        any specified in `download_variables`.
    leaves_of_group
        One or more groups (as defined by the U.S. Census for the data set)
        whose leaf variable values should be downloaded. These are in addition to
        any specified in `download_variables` or `group`. See
        :py:meth:`VariableCache.group_leaves` for more details on the semantics of
        leaves vs. non-leaf group variables.
    skip_annotations
        If `True` try to filter out `group` or `leaves_of_group` variables that are
        annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
        for more details. Variable names passed in `download_variables` are not
        affected by this flag.
    variable_cache
        A cache of metadata about variables.

    Returns
    -------
    The fully expanded list of variables to download.
    """
    # Turn the variables we were given into a list if they are not already.
    if download_variables is None:
        download_variables = []
    elif isinstance(download_variables, str):
        download_variables = [download_variables]
    elif not isinstance(download_variables, list):
        download_variables = list(download_variables)

    if group is None:
        group = []
    elif isinstance(group, str):
        group = [group]

    if leaves_of_group is None:
        leaves_of_group = []
    elif isinstance(leaves_of_group, str):
        leaves_of_group = [leaves_of_group]

    # Add group variables and leaves as appropriate.
    group_variables: List[str] = []
    for group_name in group:
        group_variables = group_variables + variable_cache.group_variables(
            dataset, vintage, group_name, skip_annotations=skip_annotations
        )
    group_leaf_variables: List[str] = []
    for group_name in leaves_of_group:
        group_leaf_variables = group_leaf_variables + variable_cache.group_leaves(
            dataset, vintage, group_name, skip_annotations=skip_annotations
        )

    # Concatenate them all.
    download_variables = download_variables + group_variables + group_leaf_variables

    # Dedup and maintain order.
    download_variables = list(dict.fromkeys(download_variables))

    return download_variables
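

# Illustrative sketch (not part of the original module): the dedup step above
# preserves first-occurrence order using dict key ordering, e.g.
#
#   list(dict.fromkeys(["NAME", "B01001_001E", "NAME"]))
#   # -> ['NAME', 'B01001_001E']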


def census_table_url(
    dataset: str,
    vintage: VintageType,
    download_variables: Iterable[str],
    *,
    query_filter: Optional[Dict[str, str]] = None,
    api_key: Optional[str] = None,
    **kwargs: cgeo.InSpecType,
) -> Tuple[str, Mapping[str, str], cgeo.BoundGeographyPath]:
    """
    Construct the URL to download data from the U.S. Census API.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    download_variables
        The census variables to download, for example `["NAME", "B01001_001E"]`.
    query_filter
        A dictionary of values to filter on. For example, if
        `query_filter={'NAICS2017': '72251'}` then only rows
        where the variable `NAICS2017` has a value of `'72251'`
        will be returned.

        This filtering is done on the server side, not the client
        side, so it is far more efficient than querying without a
        query filter and then manually filtering the results.
    api_key
        An optional API key. If you don't have or don't use a key, the number
        of calls you can make will be limited.
    kwargs
        A specification of the geometry that we want data for.

    Returns
    -------
    The URL, parameters and bound path.
    """
    bound_path = _bind_path_if_possible(dataset, vintage, **kwargs)

    query_spec = cgeo.CensusGeographyQuerySpec(
        dataset, vintage, list(download_variables), bound_path, api_key=api_key
    )

    url, params = query_spec.table_url(query_filter=query_filter)

    return url, params, bound_path
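

# Illustrative sketch (not part of the original module): for a call like
# census_table_url("acs/acs5", 2020, ["NAME"], state="34", county="*"),
# the returned URL and parameters are expected to take the usual census API
# form, along the lines of
#
#   https://api.census.gov/data/2020/acs/acs5?get=NAME&for=county:*&in=state:34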


def _bind_path_if_possible(dataset, vintage, **kwargs):
    """
    Bind the path if possible.

    If not, raise an exception with enough info to fix it.
    """
    bound_path = cgeo.PathSpec.partial_prefix_match(dataset, vintage, **kwargs)

    if bound_path is None:
        if kwargs:
            path_specs = cgeo.geo_path_snake_specs(dataset, vintage)
            possible_path_spec_keys = set(
                geo for path_name in path_specs.values() for geo in path_name
            )

            non_geo_kwargs = [
                k for k in kwargs.keys() if k not in possible_path_spec_keys
            ]

            if non_geo_kwargs:
                msg = f"""
The following arguments are not recognized as either non-geographic or geographic arguments
for the dataset {dataset} in vintage {vintage}: '{"', '".join(non_geo_kwargs)}'.

There are two reasons why this might happen:

1. The arg(s) mentioned above are misspelled versions of named or geographic arguments.
2. The arg(s) mentioned above are valid geographic arguments for some data sets and
   vintages, but not for {dataset} in vintage {vintage}.
"""
            else:
                msg = f"""
Unable to match the geography specification {kwargs}.
"""

            raise CensusApiException(
                f"{msg}"
                f"Supported geographies for dataset='{dataset}' in year={vintage} are:\n"
                + "\n".join(
                    f"{path_spec}"
                    for path_spec in cgeo.geo_path_snake_specs(
                        dataset, vintage
                    ).values()
                )
            )
        else:
            bound_path = cgeo.BoundGeographyPath("000", cgeo.PathSpec.empty_path_spec())

    return bound_path


def geography_names(
    dataset: str,
    vintage: VintageType,
    **kwargs: cgeo.InSpecType,
) -> pd.DataFrame:
    """
    Get the name of a specific geography.

    The arguments are a subset of those to :py:func:`~download`. This
    function is designed to make it easy to fetch the name of a geography
    when we know the FIPS code but want a human-readable name or label for
    display.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `censusdis.datasets.ACS5`.
    vintage
        The vintage to download data for. For example, `2020`.
    kwargs
        A specification of the geometry that we want data for. For example,
        `state = "34", county = "017"` will download the name of Hudson County,
        New Jersey.

    Returns
    -------
    A dataframe with columns specifying the geography and one for the name.
    All column names will be in ALL CAPS.
    """
    df = download(dataset, vintage, ["NAME"], **kwargs)

    return df
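

# Illustrative sketch (not part of the original module), following the
# docstring above:
#
#   geography_names(ACS5, 2020, state="34", county="017")
#
# returns a single-row dataframe with columns STATE, COUNTY, and NAME, where
# NAME is "Hudson County, New Jersey".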


def geographies(dataset: str, vintage: VintageType) -> List[List[str]]:
    """
    Determine what geographies are supported for a dataset and vintage.

    This utility gives us a list of the different geography
    keywords we can use in calls to :py:func:`download`
    for the given dataset and vintage.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.

    Returns
    -------
    A list of lists of geography keywords. Each element
    of the outer list is a list of keywords that can be
    used together.
    """
    return list(cgeo.geo_path_snake_specs(dataset, vintage).values())
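

# Illustrative sketch (not part of the original module): for most data sets
# the returned list contains entries like
#
#   ['state'], ['state', 'county'], ['state', 'county', 'tract'], ...
#
# where each inner list is a combination of geography keywords that can be
# used together in a call to `download`.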


variables = VariableCache()


def _intersecting_geos_kws(
    dataset: str,
    vintage: VintageType,
    containing_geo_kwargs: cgeo.InSpecType,
    **kwargs: cgeo.InSpecType,
) -> cgeo.InSpecType:
    """
    Construct geography keywords for intersecting geographies.

    Parameters
    ----------
    dataset
        The dataset to download from. For example `"acs/acs5"`,
        `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
        symbolic names for datasets, like `ACS5` for `"acs/acs5"`
        in :py:mod:`censusdis.datasets`.
    vintage
        The vintage to download data for. For most data sets this is
        an integer year, for example, `2020`. But for
        a timeseries data set, pass the string `'timeseries'`.
    containing_geo_kwargs
        Geographic keywords specifying the containing geography that we are
        looking for intersections with. For example,
        `dict(metropolitan_statistical_area_micropolitan_statistical_area="35620")`
        for the New York area CBSA.
    kwargs
        A specification of the geometry that we want data for, limited to those
        geographies that are contained in the geography specified by `containing_geo_kwargs`.
        For example, `state="*", county="*", tract="*"` specifies tract-level data for
        all tracts contained in the containing geography.

    Returns
    -------
    A dictionary of geographic keywords suitable for passing to :py:func:`~download`.
    """
    # This is a fast short circuit if there is only one
    # element of kwargs or the first component
    # is already specified. The former is because we will have to
    # query with the kwargs as they are, so we might as well
    # just let our caller do it. The latter is because
    # we might trim down the list, but more likely the user
    # double specified at the top level, like state=.
    if len(kwargs) == 1 or list(kwargs.values())[0] != "*":
        return kwargs

    # Download the geometry of the outer scope.
    gdf_within = download(
        dataset, vintage, ["NAME"], with_geometry=True, **containing_geo_kwargs
    )

    # See if we can find a matching path spec.
    bound_path = _bind_path_if_possible(dataset, vintage, **kwargs)

    # Get the geography for the outermost level of the match.
    first_binding = list(bound_path.bindings.items())[0]

    containing_geo_kwargs = {first_binding[0]: first_binding[1]}

    gdf_first_binding = download(
        dataset, vintage, ["NAME"], with_geometry=True, **containing_geo_kwargs
    )

    # Which of the first binding geographies intersect
    # the area we want our final geographies to be in.
    gdf_intersects = gdf_first_binding.sjoin(
        gdf_within, lsuffix="FIRST", rsuffix="within"
    )

    col_name = first_binding[0].replace(" ", "_").upper()
    if col_name not in gdf_intersects.columns:
        col_name = f"{col_name}_FIRST"

    intersecting_geographies = list(gdf_intersects[col_name].unique())

    # Short circuit if there are a massive number of intersecting
    # geographies. In this case, we'll just leave things as they came with
    # the leading '*' and query them all. Otherwise the URL gets super
    # long and things go a little crazy. This can happen with zip code
    # tabulation areas.
    if len(intersecting_geographies) > 20:
        return dict(**kwargs)

    intersecting_geographies = [
        geo[:-6] if geo.endswith("_FIRST") else geo for geo in intersecting_geographies
    ]

    geo = dict(bound_path.bindings)

    geo[first_binding[0]] = intersecting_geographies

    return geo
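

# Illustrative sketch (not part of the original module): if the containing
# geography is the New York area CBSA ("35620") and the caller asked for
# state="*", county="*", the returned keywords would replace the leading
# state="*" with the short list of state FIPS codes whose geometries
# intersect the CBSA, leaving county="*" as it was.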


class ContainedWithin:
    """A representation of a geography that we want to query some other geographies that are contained within."""

    def __init__(self, area_threshold: float = 0.8, **kwargs: cgeo.InSpecType):
        """
        Construct a representation of a geography that we want to query some other geographies contained within.

        Parameters
        ----------
        area_threshold
            What fraction of the area of other geographies must be contained
            in our geography to be included.
        kwargs
            A specification of the containing geography within which we want
            to query other geographies. For example,
            `state = "NJ", place = "01960"` will specify the city of Asbury Park, NJ.
        """
        self._area_threshold = area_threshold
        self._containing_kwargs = kwargs

    def __eq__(self, other) -> bool:
        """Are two objects equal."""
        if not isinstance(other, ContainedWithin):
            return False

        return (
            self._area_threshold == other._area_threshold
            and self._containing_kwargs == other._containing_kwargs
        )

    def __enter__(self) -> "ContainedWithin":
        """
        Enter the context.

        Returns
        -------
        The ContainedWithin object for use within the context.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the context."""
        pass
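
    # Illustrative sketch (not part of the original module): `ContainedWithin`
    # is usually used as a context manager, along the lines of
    #
    #   with ContainedWithin(
    #       metropolitan_statistical_area_micropolitan_statistical_area="35620"
    #   ) as nyc_metro:
    #       df = nyc_metro.download(ACS5, 2020, ["NAME"], state="*", county="*")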
1478 def download(
1479 self,
1480 dataset: str,
1481 vintage: VintageType,
1482 download_variables: Optional[Union[str, Iterable[str]]] = None,
1483 *,
1484 group: Optional[Union[str, Iterable[str]]] = None,
1485 leaves_of_group: Optional[Union[str, Iterable[str]]] = None,
1486 set_to_nan: Union[bool, Iterable[int]] = True,
1487 skip_annotations: bool = True,
1488 query_filter: Optional[Dict[str, str]] = None,
1489 with_geometry: bool = False,
1490 with_geometry_columns: bool = False,
1491 tiger_shapefiles_only: bool = False,
1492 remove_water: bool = False,
1493 api_key: Optional[str] = None,
1494 variable_cache: Optional["VariableCache"] = None,
1495 row_keys: Optional[Union[str, Iterable[str]]] = None,
1496 **kwargs: cgeo.InSpecType,
1497 ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
1498 """
1499 Download data for geographies contained within a containing geography.
1501 Parameters
1502 ----------
1503 dataset
1504 The dataset to download from. For example `"acs/acs5"`,
1505 `"dec/pl"`, or `"timeseries/poverty/saipe/schdist"`. There are
1506 symbolic names for datasets, like `ACS5` for `"acs/acs5"
1507 in :py:module:`censusdis.datasets`.
1508 vintage
1509 The vintage to download data for. For most data sets this is
1510 an integer year, for example, `2020`. But for
1511 a timeseries data set, pass the string `'timeseries'`.
1512 download_variables
1513 The census variables to download, for example `["NAME", "B01001_001E"]`.
1514 group
1515 One or more groups (as defined by the U.S. Census for the data set)
1516 whose variable values should be downloaded. These are in addition to
1517 any specified in `download_variables`.
1518 leaves_of_group
1519 One or more groups (as defined by the U.S. Census for the data set)
1520 whose leaf variable values should be downloaded.These are in addition to
1521 any specified in `download_variables` or `group`. See
1522 :py:meth:`VariableCache.group_leaves` for more details on the semantics of
1523 leaves vs. non-leaf group variables.
1524 set_to_nan
1525 A list of values that should be set to NaN. Normally these are special
1526 values that the U.S. Census API sometimes returns. If `True`, then all
1527 values in :py:ref:`censusdis.values.ALL_SPECIAL_VALUES` will be replaced.
1528 If `False`, no replacements will be made.
1529 skip_annotations
1530 If `True` try to filter out `group` or `leaves_of_group` variables that are
1531 annotations rather than actual values. See :py:meth:`VariableCache.group_variables`
1532 for more details. Variable names passed in `download_variables` are not
1533 affected by this flag.
1534 query_filter
1535 A dictionary of values to filter on. For example, if
1536 `query_filter={'NAICS2017': '72251'}` then only rows
1537 where the variable `NAICS2017` has a value of `'72251'`
1538 will be returned.
1540 This filtering is done on the server side, not the client
1541 side, so it is far more efficient than querying without a
1542 query filter and then manually filtering the results.
1543 with_geometry
1544 If `True` a :py:class:`gpd.GeoDataFrame` will be returned and each row
1545 will have a geometry that is a cartographic boundary suitable for platting
1546 a map. See https://www.census.gov/geographies/mapping-files/time-series/geo/cartographic-boundary.2020.html
1547 for details of the shapefiles that will be downloaded on your behalf to
1548 generate these boundaries.
1549 with_geometry_columns
1550 If `True` keep all the additional columns that come with shapefiles
1551 downloaded to get geometry information.
1552 tiger_shapefiles_only
1553 If `True` only look for TIGER shapefiles. If `False`, first look
1554 for CB shapefiles
1555 (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
1556 which are more suitable for plotting maps, then fall back on the full
1557 TIGER files
1558 (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
1559 only if CB is not available. This is typically set to `True`
1560 only when `with_geometry_columns` is also `True`, because the
1561 additional columns that come with the CB shapefiles differ from
1562 those that come with the TIGER shapefiles.
1563 remove_water
1564 If `True` and `with_geometry=True`, query TIGER for AREAWATER shapefiles
1565 and remove water areas from the returned geometry.
1566 api_key
1567 An optional API key. If you don't have or don't use a key, the number
1568 of calls you can make will be limited to 500 per day.
1569 variable_cache
1570 A cache of metadata about variables.
1571 row_keys
1572 An optional set of identifier keys to help merge together requests for more than the census API limit of
1573 50 variables per query. These keys are useful for census datasets such as the Current Population Survey
1574 where the geographic identifiers do not uniquely identify each row.
1575 kwargs
1576 A specification of the geometry that we want data for. For example,
1577 `state = "*", county = "*"` will download county-level data for
1578 the entire US.
1580 Returns
1581 -------
1582 A :py:class:`~pd.DataFrame` or :py:class:`~gpd.GeoDataFrame` containing the requested US Census data.
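Examples
--------
A minimal sketch. The metro-area code `"35620"` (New York-Newark-Jersey City)
and the variable `B01001_001E` (total population) are illustrative::

    import censusdis.data as ced
    from censusdis.datasets import ACS5

    # Tracts do not strictly nest within metro areas, so we ask
    # for the tracts mostly contained within the metro area.
    df_tracts = ced.contained_within(
        metropolitan_statistical_area_micropolitan_statistical_area="35620"
    ).download(
        ACS5,
        2020,
        ["NAME", "B01001_001E"],
        state="*",
        county="*",
        tract="*",
    )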
1583 """
1584 geos_kwargs = _intersecting_geos_kws(
1585 dataset, vintage, self._containing_kwargs, **kwargs
1586 )
1588 gdf = download(
1589 dataset,
1590 vintage,
1591 download_variables,
1592 group=group,
1593 leaves_of_group=leaves_of_group,
1594 set_to_nan=set_to_nan,
1595 skip_annotations=skip_annotations,
1596 query_filter=query_filter,
1597 with_geometry=True,
1598 with_geometry_columns=with_geometry_columns,
1599 tiger_shapefiles_only=tiger_shapefiles_only,
1600 remove_water=remove_water,
1601 api_key=api_key,
1602 variable_cache=variable_cache,
1603 row_keys=row_keys,
1604 **geos_kwargs,
1605 )
1607 # See which of these geometries are mostly contained by
1608 # the geography we want them to be within.
1610 gdf_container = download(
1611 dataset, vintage, ["NAME"], with_geometry=True, **self._containing_kwargs
1612 ).drop("NAME", axis="columns")
1614 gdf_contained = cmap.sjoin_mostly_contains(
1615 gdf_container, gdf, area_threshold=self._area_threshold
1616 )
1618 # Drop all the large container columns we don't need.
1619 gdf_contained = gdf_contained[
1620 [col for col in gdf_contained.columns if not col.endswith("_large")]
1621 ].reset_index(drop=True)
1623 # Drop the "_small" suffix.
1625 gdf_contained.rename(
1626 lambda col: col[:-6] if col.endswith("_small") else col,
1627 axis="columns",
1628 inplace=True,
1629 )
1631 if with_geometry:
1632 # Keep the columns from the larger result.
1633 return gdf_contained[
1634 [
1635 col
1636 for col in gdf_container.columns
1637 if col in gdf_contained.columns and col != "geometry"
1638 ]
1639 + [
1640 col
1641 for col in gdf_contained.columns
1642 if col not in gdf_container.columns or col == "geometry"
1643 ]
1644 ]
1645 else:
1646 # Drop the geometry and return a `pd.DataFrame`.
1647 return pd.DataFrame(
1648 gdf_contained[
1649 [
1650 col
1651 for col in gdf_container.columns
1652 if col in gdf_contained.columns and col != "geometry"
1653 ]
1654 + [
1655 col
1656 for col in gdf_contained.columns
1657 if col not in gdf_container.columns and col != "geometry"
1658 ]
1659 ]
1660 )
1663def contained_within(
1664 area_threshold: float = 0.8, **kwargs: cgeo.InSpecType
1665) -> ContainedWithin:
1666 """
1667 Construct a representation of a containing geography within which other geographies can be queried.
1669 Parameters
1670 ----------
1671 area_threshold
1672 What fraction of the area of other geographies must be contained
1673 in our geography to be included.
1674 kwargs
1675 A specification of the containing geography within which we want
1676 data for other geographies. For example,
1677 `state = "NJ", place = "01960"` specifies the city of Asbury Park, NJ.
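Examples
--------
A sketch using the Asbury Park specification above; the variable
`B01001_001E` (total population) is illustrative::

    import censusdis.data as ced
    from censusdis.datasets import ACS5
    from censusdis.states import NJ

    # Tracts mostly contained within the city of Asbury Park, NJ.
    df_tracts = ced.contained_within(state=NJ, place="01960").download(
        ACS5, 2020, ["NAME", "B01001_001E"], state=NJ, county="*", tract="*"
    )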
1678 """
1679 return ContainedWithin(area_threshold=area_threshold, **kwargs)
1682def add_inferred_geography(
1683 df_data: pd.DataFrame,
1684 year: Optional[int] = None,
1685 *,
1686 with_geometry_columns: bool = False,
1687 tiger_shapefiles_only: bool = False,
1688) -> gpd.GeoDataFrame:
1689 """
1690 Infer the geography level of the given dataframe.
1692 Add geometry to each row for the inferred level.
1694 See Also
1695 --------
1696 :py:func:`~infer_geo_level` for more on how inference is done.
1698 Parameters
1699 ----------
1700 df_data
1701 A dataframe of variables with one or more columns that
1702 can be used to infer what geometry level the rows represent.
1703 year
1704 The year for which to fetch geometries. We need this
1705 because they change over time. If `None`, look for a
1706 `'YEAR'` column in `df_data` and possibly add different
1707 geometries for different years as needed.
1708 with_geometry_columns
1709 If `True`, keep all the additional columns that come with shapefiles
1710 downloaded to get geometry information.
1711 tiger_shapefiles_only
1712 If `True`, only look for TIGER shapefiles. If `False`, first look
1713 for CB shapefiles
1714 (https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html),
1715 which are more suitable for plotting maps, then fall back on the full
1716 TIGER files
1717 (https://www.census.gov/geographies/mapping-files/time-series/geo/tiger-line-file.html)
1718 only if CB is not available. This is typically set to `True`
1719 only when `with_geometry_columns` is also `True`, because the
1720 additional columns that come with the CB shapefiles differ from
1721 those that come with the TIGER shapefiles.
1723 Returns
1724 -------
1725 A geo data frame containing the original data augmented with
1726 the appropriate geometry for each row.
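Examples
--------
A sketch, assuming `df_data` came from an earlier county-level
download and therefore has `STATE` and `COUNTY` columns from which
the geography level can be inferred::

    import censusdis.data as ced

    gdf_counties = ced.add_inferred_geography(df_data, 2020)
    gdf_counties.plot()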
1727 """
1728 if year is None:
1729 # We'll try to get the year out of the data.
1730 if "YEAR" not in df_data.columns:
1731 raise ValueError(
1732 "If year is None then there must be a `YEAR` column in the data."
1733 )
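# Recurse on each year's group, then flatten the group index
# back out so `YEAR` becomes an ordinary column again.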
1735 return gpd.GeoDataFrame(
1736 df_data.groupby("YEAR", group_keys=True, sort=False)
1737 .apply(
1738 lambda df_group: add_inferred_geography(
1739 df_group,
1740 df_group.name,
1741 with_geometry_columns=with_geometry_columns,
1742 tiger_shapefiles_only=tiger_shapefiles_only,
1743 ),
1744 include_groups=False,
1745 )
1746 .reset_index(level=1, drop=True)
1747 .reset_index(drop=False)
1748 )
1750 geo_level = infer_geo_level(year, df_data)
1752 (
1753 shapefile_scope,
1754 _,
1755 shapefile_scope_columns,
1756 _,
1757 ) = geo_query_from_data_query_inner_geo(year, geo_level)
1759 if shapefile_scope is not None:
1760 # The scope is the same across the board.
1761 gdf = add_geography(
1762 df_data,
1763 year,
1764 shapefile_scope,
1765 geo_level,
1766 with_geometry_columns=with_geometry_columns,
1767 tiger_shapefiles_only=tiger_shapefiles_only,
1768 )
1769 return gdf
1771 # We have to group by different values of the shapefile
1772 # scope from the appropriate column and add the right
1773 # geography to each group.
1774 shapefile_scope_column = shapefile_scope_columns[0]
1776 df_with_geo = (
1777 df_data.set_index(shapefile_scope_column)
1778 .groupby(level=shapefile_scope_column, group_keys=True, sort=False)
1779 .apply(
1780 lambda g: add_geography(
1781 g,
1782 year,
1783 g.name,
1784 geo_level,
1785 with_geometry_columns=with_geometry_columns,
1786 tiger_shapefiles_only=tiger_shapefiles_only,
1787 )
1788 )
1789 .reset_index(level=1, drop=True)
1790 .reset_index(drop=False)
1791 )
1793 gdf = gpd.GeoDataFrame(df_with_geo)
1795 return gdf
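# Re-export the certificate configuration object so users can
# access it as `censusdis.data.certificates`.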
1798certificates = censusdis.impl.fetch.certificates